Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
     27 /* All Rights Reserved */
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
     34 #include <sys/types.h>
     35 #include <sys/t_lock.h>
     36 #include <sys/param.h>
     37 #include <sys/time.h>
     38 #include <sys/systm.h>
     39 #include <sys/sysmacros.h>
     40 #include <sys/resource.h>
     41 #include <sys/signal.h>
     42 #include <sys/cred.h>
     43 #include <sys/user.h>
     44 #include <sys/buf.h>
     45 #include <sys/vfs.h>
     46 #include <sys/vnode.h>
     47 #include <sys/proc.h>
     48 #include <sys/disp.h>
     49 #include <sys/file.h>
     50 #include <sys/fcntl.h>
     51 #include <sys/flock.h>
     52 #include <sys/kmem.h>
     53 #include <sys/uio.h>
     54 #include <sys/dnlc.h>
     55 #include <sys/conf.h>
     56 #include <sys/mman.h>
     57 #include <sys/pathname.h>
     58 #include <sys/debug.h>
     59 #include <sys/vmsystm.h>
     60 #include <sys/cmn_err.h>
     61 #include <sys/filio.h>
     62 #include <sys/atomic.h>
     63 
     64 #include <sys/fssnap_if.h>
     65 #include <sys/fs/ufs_fs.h>
     66 #include <sys/fs/ufs_lockfs.h>
     67 #include <sys/fs/ufs_filio.h>
     68 #include <sys/fs/ufs_inode.h>
     69 #include <sys/fs/ufs_fsdir.h>
     70 #include <sys/fs/ufs_quota.h>
     71 #include <sys/fs/ufs_trans.h>
     72 #include <sys/fs/ufs_panic.h>
     73 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
     74 #include <sys/errno.h>
     75 
     76 #include <sys/filio.h>		/* _FIOIO */
     77 
     78 #include <vm/hat.h>
     79 #include <vm/page.h>
     80 #include <vm/pvn.h>
     81 #include <vm/as.h>
     82 #include <vm/seg.h>
     83 #include <vm/seg_map.h>
     84 #include <vm/seg_vn.h>
     85 #include <vm/seg_kmem.h>
     86 #include <vm/rm.h>
     87 #include <sys/swap.h>
     88 #include <sys/epm.h>
     89 
     90 #include <fs/fs_subr.h>
     91 
     92 static void	*ufs_directio_zero_buf;	/* zero-filled source for hole reads */
     93 static int	ufs_directio_zero_len	= 8192;	/* size of that buffer, bytes */
     94 
        /*
         * Master switch.  ufs_directio_read() and ufs_directio_write() both
         * test this first and, when it is 0, return 0 at once with
         * *statusp left at DIRECTIO_FAILURE.
         */
     95 int	ufs_directio_enabled = 1;	/* feature is enabled */
     96 
     97 /*
     98  * for kstats reader
         *
         * Counters published by ufs_directio_init() as the named kstat
         * "ufs":0:"directio".  They are bumped with plain (non-atomic)
         * increments in this file.
     99  */
    100 struct ufs_directio_kstats {
    101 	kstat_named_t	logical_reads;	/* ufs_directio_read() calls that went direct */
    102 	kstat_named_t	phys_reads;	/* read requests issued by directio_start() */
    103 	kstat_named_t	hole_reads;	/* directio_hole() calls */
    104 	kstat_named_t	nread;		/* bytes: issued reads plus hole reads */
    105 	kstat_named_t	logical_writes;	/* ufs_directio_write() calls that went direct */
    106 	kstat_named_t	phys_writes;	/* write requests issued by directio_start() */
    107 	kstat_named_t	nwritten;	/* bytes in issued write requests */
    108 	kstat_named_t	nflushes;	/* VOP_PUTPAGE(B_INVAL) cache flushes counted */
    109 } ufs_directio_kstats = {
    110 	{ "logical_reads",	KSTAT_DATA_UINT64 },
    111 	{ "phys_reads",		KSTAT_DATA_UINT64 },
    112 	{ "hole_reads",		KSTAT_DATA_UINT64 },
    113 	{ "nread",		KSTAT_DATA_UINT64 },
    114 	{ "logical_writes",	KSTAT_DATA_UINT64 },
    115 	{ "phys_writes",	KSTAT_DATA_UINT64 },
    116 	{ "nwritten",		KSTAT_DATA_UINT64 },
    117 	{ "nflushes",		KSTAT_DATA_UINT64 },
    118 };
    119 
        /* kstat handle; stays NULL if kstat_create() fails in ufs_directio_init() */
    120 kstat_t	*ufs_directio_kstatsp;
    121 
    122 /*
    123  * use kmem_cache_create for direct-physio buffers. This has shown
    124  * a better cache distribution compared to buffers on the
    125  * stack. It also avoids semaphore construction/deconstruction
    126  * per request
         *
         * One directio_buf describes one outstanding request.  directio_start()
         * allocates it and pushes it on the front of a singly linked list;
         * directio_wait_one() returns it to the cache.
    127  */
    128 struct directio_buf {
    129 	struct directio_buf	*next;	/* previously issued request, or NULL */
    130 	char		*addr;		/* memory address of this transfer */
    131 	size_t		nbytes;		/* length of this transfer */
    132 	struct buf	buf;		/* buf header; bioinit()ed by the constructor */
    133 };
        /* object cache for struct directio_buf, created by directio_bufs_init() */
    134 static struct kmem_cache *directio_buf_cache;
    135 
    136 
    137 /* ARGSUSED */
    138 static int
    139 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
    140 {
    141 	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
    142 	return (0);
    143 }
    144 
    145 /* ARGSUSED */
    146 static void
    147 directio_buf_destructor(void *dbp, void *cdrarg)
    148 {
    149 	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
    150 }
    151 
    152 void
    153 directio_bufs_init(void)
    154 {
    155 	directio_buf_cache = kmem_cache_create("directio_buf_cache",
    156 	    sizeof (struct directio_buf), 0,
    157 	    directio_buf_constructor, directio_buf_destructor,
    158 	    NULL, NULL, NULL, 0);
    159 }
    160 
    161 void
    162 ufs_directio_init(void)
    163 {
    164 	/*
    165 	 * kstats
    166 	 */
    167 	ufs_directio_kstatsp = kstat_create("ufs", 0,
    168 	    "directio", "ufs", KSTAT_TYPE_NAMED,
    169 	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
    170 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
    171 	if (ufs_directio_kstatsp) {
    172 		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
    173 		kstat_install(ufs_directio_kstatsp);
    174 	}
    175 	/*
    176 	 * kzero is broken so we have to use a private buf of zeroes
    177 	 */
    178 	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
    179 	directio_bufs_init();
    180 }
    181 
    182 /*
    183  * Wait for the first direct IO operation to finish
    184  */
    185 static int
    186 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
    187 {
    188 	buf_t	*bp;
    189 	int	error;
    190 
    191 	/*
    192 	 * Wait for IO to finish
    193 	 */
    194 	bp = &dbp->buf;
    195 	error = biowait(bp);
    196 
    197 	/*
    198 	 * bytes_io will be used to figure out a resid
    199 	 * for the caller. The resid is approximated by reporting
    200 	 * the bytes following the first failed IO as the residual.
    201 	 *
    202 	 * I am cautious about using b_resid because I
    203 	 * am not sure how well the disk drivers maintain it.
    204 	 */
    205 	if (error)
    206 		if (bp->b_resid)
    207 			*bytes_iop = bp->b_bcount - bp->b_resid;
    208 		else
    209 			*bytes_iop = 0;
    210 	else
    211 		*bytes_iop += bp->b_bcount;
    212 	/*
    213 	 * Release direct IO resources
    214 	 */
    215 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
    216 	kmem_cache_free(directio_buf_cache, dbp);
    217 	return (error);
    218 }
    219 
    220 /*
    221  * Wait for all of the direct IO operations to finish
    222  */
    223 
    224 uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */
    225 
    226 static int
    227 directio_wait(struct directio_buf *tail, long *bytes_iop)
    228 {
    229 	int	error = 0, newerror;
    230 	struct directio_buf	*dbp;
    231 	uint_t	kpri_req_save;
    232 
    233 	/*
    234 	 * The linked list of directio buf structures is maintained
    235 	 * in reverse order (tail->last request->penultimate request->...)
    236 	 */
    237 	/*
    238 	 * This is the k_pri_req hack. Large numbers of threads
    239 	 * sleeping with kernel priority will cause scheduler thrashing
    240 	 * on an MP machine. This can be seen running Oracle using
    241 	 * directio to ufs files. Sleep at normal priority here to
    242 	 * more closely mimic physio to a device partition. This
    243 	 * workaround is disabled by default as a niced thread could
    244 	 * be starved from running while holding i_rwlock and i_contents.
    245 	 */
    246 	if (ufs_directio_drop_kpri) {
    247 		kpri_req_save = curthread->t_kpri_req;
    248 		curthread->t_kpri_req = 0;
    249 	}
    250 	while ((dbp = tail) != NULL) {
    251 		tail = dbp->next;
    252 		newerror = directio_wait_one(dbp, bytes_iop);
    253 		if (error == 0)
    254 			error = newerror;
    255 	}
    256 	if (ufs_directio_drop_kpri)
    257 		curthread->t_kpri_req = kpri_req_save;
    258 	return (error);
    259 }
    260 /*
    261  * Initiate direct IO request
    262  */
    263 static void
    264 directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
    265 	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
    266 	struct directio_buf **tailp, page_t **pplist)
    267 {
    268 	buf_t *bp;
    269 	struct directio_buf *dbp;
    270 
    271 	/*
    272 	 * Allocate a directio buf header
    273 	 *   Note - list is maintained in reverse order.
    274 	 *   directio_wait_one() depends on this fact when
    275 	 *   adjusting the ``bytes_io'' param. bytes_io
    276 	 *   is used to compute a residual in the case of error.
    277 	 */
    278 	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
    279 	dbp->next = *tailp;
    280 	*tailp = dbp;
    281 
    282 	/*
    283 	 * Initialize buf header
    284 	 */
    285 	dbp->addr = addr;
    286 	dbp->nbytes = nbytes;
    287 	bp = &dbp->buf;
    288 	bp->b_edev = ip->i_dev;
    289 	bp->b_lblkno = btodt(offset);
    290 	bp->b_bcount = nbytes;
    291 	bp->b_un.b_addr = addr;
    292 	bp->b_proc = procp;
    293 	bp->b_file = ip->i_vnode;
    294 
    295 	/*
    296 	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
    297 	 * will B_READ data from the filesystem and S_WRITE it into
    298 	 * the user's buffer; a write(2) will S_READ data from the
    299 	 * user's buffer and B_WRITE it to the filesystem.
    300 	 */
    301 	if (rw == S_WRITE) {
    302 		bp->b_flags = B_BUSY | B_PHYS | B_READ;
    303 		ufs_directio_kstats.phys_reads.value.ui64++;
    304 		ufs_directio_kstats.nread.value.ui64 += nbytes;
    305 	} else {
    306 		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
    307 		ufs_directio_kstats.phys_writes.value.ui64++;
    308 		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
    309 	}
    310 	bp->b_shadow = pplist;
    311 	if (pplist != NULL)
    312 		bp->b_flags |= B_SHADOW;
    313 
    314 	/*
    315 	 * Issue I/O request.
    316 	 */
    317 	ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
    318 	if (ufsvfsp->vfs_snapshot)
    319 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
    320 	else
    321 		(void) bdev_strategy(bp);
    322 
    323 	if (rw == S_WRITE)
    324 		lwp_stat_update(LWP_STAT_OUBLK, 1);
    325 	else
    326 		lwp_stat_update(LWP_STAT_INBLK, 1);
    327 
    328 }
    329 
        /*
         * Write-path statistics maintained by ufs_directio_write().  Only
         * ufs_cur_writes is updated atomically; the others use plain
         * increments and assignments.
         */
    330 uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
    331 uint32_t	ufs_cur_writes;		/* # concurrent writes */
    332 uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
    333 uint32_t	ufs_posix_hits;		/* writes done /w lock excl. */
    334 
    335 /*
    336  * Force POSIX synchronous data integrity on all writes for testing.
         * When non-zero, every re-write is put through the POSIX check in
         * ufs_directio_write() as if FDSYNC had been set in ioflag.
    337  */
    338 uint32_t	ufs_force_posix_sdi = 0;
    339 
    340 /*
    341  * Direct Write
         *
         * Try to write the data described by arg_uio directly to the device,
         * bypassing the page cache.
         *
         *	ip	- inode of the file being written.  The rw_exit(),
         *		  rw_enter() and rw_downgrade() calls on ip->i_contents
         *		  below presume the caller already holds that lock.
         *	arg_uio	- caller's request; on the direct path its uio_resid is
         *		  rewritten to the amount left untransferred.
         *	ioflag	- only FDSYNC is examined here.
         *	rewrite	- nonzero for a re-write: the allocation pass is
         *		  skipped entirely.
         *	cr	- credentials handed to bmap_write(), ufs_itrunc()
         *		  and VOP_PUTPAGE().
         *	statusp	- set to DIRECTIO_FAILURE on entry and changed to
         *		  DIRECTIO_SUCCESS only when the transfer loop is
         *		  about to start.
         *
         * Every path that leaves *statusp at DIRECTIO_FAILURE returns 0.
         * Once *statusp is DIRECTIO_SUCCESS the return value is the first
         * error seen in the transfer loop (from as_pagelock(), bmap_read()
         * or the I/O itself), or 0.
    342  */
    343 
    344 int
    345 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
    346 	cred_t *cr, int *statusp)
    347 {
    348 	long		resid, bytes_written;
    349 	u_offset_t	size, uoff;
    350 	uio_t		*uio = arg_uio;
    351 	rlim64_t	limit = uio->uio_llimit;
    352 	int		on, n, error, newerror, len, has_holes;
    353 	daddr_t		bn;
    354 	size_t		nbytes;
    355 	struct fs	*fs;
    356 	vnode_t		*vp;
    357 	iovec_t		*iov;
    358 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
    359 	struct proc	*procp;
    360 	struct as	*as;
    361 	struct directio_buf	*tail;
    362 	int		exclusive, ncur, bmap_peek;
    363 	uio_t		copy_uio;
    364 	iovec_t		copy_iov;
    365 	char		*copy_base;
    366 	long		copy_resid;
    367 
    368 	/*
    369 	 * assume that directio isn't possible (normal case)
    370 	 */
    371 	*statusp = DIRECTIO_FAILURE;
    372 
    373 	/*
    374 	 * Don't go direct
    375 	 */
    376 	if (ufs_directio_enabled == 0)
    377 		return (0);
    378 
    379 	/*
    380 	 * mapped file; nevermind
    381 	 */
    382 	if (ip->i_mapcnt)
    383 		return (0);
    384 
    385 	/*
    386 	 * CAN WE DO DIRECT IO?
    387 	 */
    388 	uoff = uio->uio_loffset;
    389 	resid = uio->uio_resid;
    390 
    391 	/*
    392 	 * beyond limit
    393 	 */
    394 	if (uoff + resid > limit)
    395 		return (0);
    396 
    397 	/*
    398 	 * must be sector aligned
    399 	 */
    400 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
    401 		return (0);
    402 
    403 	/*
    404 	 * SHOULD WE DO DIRECT IO?
    405 	 */
        	/*
        	 * size remembers the file size on entry so a failed or short
        	 * write can be rolled back; has_holes == -1 means "not yet
        	 * computed".
        	 */
    406 	size = ip->i_size;
    407 	has_holes = -1;
    408 
    409 	/*
    410 	 * only on regular files; no metadata
    411 	 */
    412 	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
    413 		return (0);
    414 
    415 	/*
    416 	 * Synchronous, allocating writes run very slow in Direct-Mode
    417 	 * 	XXX - can be fixed with bmap_write changes for large writes!!!
    418 	 *	XXX - can be fixed for updates to "almost-full" files
    419 	 *	XXX - WARNING - system hangs if bmap_write() has to
    420 	 * 			allocate lots of pages since pageout
    421 	 * 			suspends on locked inode
    422 	 */
    423 	if (!rewrite && (ip->i_flag & ISYNC)) {
    424 		if ((uoff + resid) > size)
    425 			return (0);
    426 		has_holes = bmap_has_holes(ip);
    427 		if (has_holes)
    428 			return (0);
    429 	}
    430 
    431 	/*
    432 	 * Each iovec must be short aligned and sector aligned.  If
    433 	 * one is not, then kmem_alloc a new buffer and copy all of
    434 	 * the smaller buffers into the new buffer.  This new
    435 	 * buffer will be short aligned and sector aligned.
    436 	 */
    437 	iov = uio->uio_iov;
    438 	nbytes = uio->uio_iovcnt;
    439 	while (nbytes--) {
    440 		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
    441 		    (intptr_t)(iov->iov_base) & 1) {
    442 			copy_resid = uio->uio_resid;
        			/*
        			 * KM_NOSLEEP: if the bounce buffer cannot be had
        			 * right now, just decline direct I/O.
        			 */
    443 			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
    444 			if (copy_base == NULL)
    445 				return (0);
    446 			copy_iov.iov_base = copy_base;
    447 			copy_iov.iov_len = copy_resid;
    448 			copy_uio.uio_iov = &copy_iov;
    449 			copy_uio.uio_iovcnt = 1;
    450 			copy_uio.uio_segflg = UIO_SYSSPACE;
    451 			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
    452 			copy_uio.uio_loffset = uio->uio_loffset;
    453 			copy_uio.uio_resid = uio->uio_resid;
    454 			copy_uio.uio_llimit = uio->uio_llimit;
        			/*
        			 * NOTE(review): uio still aliases arg_uio here, so
        			 * this copy presumably consumes the caller's uio.
        			 * The error return just below and any later
        			 * "goto errout" then report DIRECTIO_FAILURE with
        			 * that uio already advanced -- confirm the caller
        			 * tolerates it.
        			 */
    455 			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
    456 			if (error) {
    457 				kmem_free(copy_base, copy_resid);
    458 				return (0);
    459 			}
        			/* from here on the transfer runs off the copy */
    460 			uio = &copy_uio;
    461 			break;
    462 		}
    463 		iov++;
    464 	}
    465 
    466 	/*
    467 	 * From here on down, all error exits must go to errout and
    468 	 * not simply return a 0.
    469 	 */
    470 
    471 	/*
    472 	 * DIRECTIO
    473 	 */
    474 
    475 	fs = ip->i_fs;
    476 
    477 	/*
    478 	 * POSIX check. If attempting a concurrent re-write, make sure
    479 	 * that this will be a single request to the driver to meet
    480 	 * POSIX synchronous data integrity requirements.
    481 	 */
    482 	bmap_peek = 0;
    483 	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
    484 		int upgrade = 0;
    485 
    486 		/* check easy conditions first */
    487 		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
    488 			upgrade = 1;
    489 		} else {
    490 			/* now look for contiguous allocation */
    491 			len = (ssize_t)blkroundup(fs, resid);
    492 			error = bmap_read(ip, uoff, &bn, &len);
    493 			if (error || bn == UFS_HOLE || len == 0)
    494 				goto errout;
    495 			/* save a call to bmap_read later */
    496 			bmap_peek = 1;
    497 			if (len < resid)
    498 				upgrade = 1;
    499 		}
    500 		if (upgrade) {
        			/*
        			 * Not an atomic upgrade: i_contents is released
        			 * and then re-acquired as writer.
        			 */
    501 			rw_exit(&ip->i_contents);
    502 			rw_enter(&ip->i_contents, RW_WRITER);
    503 			ufs_posix_hits++;
    504 		}
    505 	}
    506 
    507 
    508 	/*
    509 	 * allocate space
    510 	 */
    511 
    512 	/*
    513 	 * If attempting a re-write, there is no allocation to do.
    514 	 * bmap_write would trip an ASSERT if i_contents is held shared.
    515 	 */
    516 	if (rewrite)
    517 		goto skip_alloc;
    518 
        	/*
        	 * Walk the request one file-system block at a time.  uoff and
        	 * resid are consumed here as scratch; both are reloaded from
        	 * uio before the transfer loop.
        	 */
    519 	do {
    520 		on = (int)blkoff(fs, uoff);
    521 		n = (int)MIN(fs->fs_bsize - on, resid);
    522 		if ((uoff + n) > ip->i_size) {
    523 			error = bmap_write(ip, uoff, (int)(on + n),
    524 			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
    525 			    NULL, cr);
    526 			/* Caller is responsible for updating i_seq if needed */
    527 			if (error)
    528 				break;
    529 			ip->i_size = uoff + n;
    530 			ip->i_flag |= IATTCHG;
    531 		} else if (n == MAXBSIZE) {
    532 			error = bmap_write(ip, uoff, (int)(on + n),
    533 			    BI_ALLOC_ONLY, NULL, cr);
    534 			/* Caller is responsible for updating i_seq if needed */
    535 		} else {
    536 			if (has_holes < 0)
    537 				has_holes = bmap_has_holes(ip);
    538 			if (has_holes) {
    539 				uint_t	blk_size;
    540 				u_offset_t offset;
    541 
    542 				offset = uoff & (offset_t)fs->fs_bmask;
    543 				blk_size = (int)blksize(fs, ip,
    544 				    (daddr_t)lblkno(fs, offset));
    545 				error = bmap_write(ip, uoff, blk_size,
    546 				    BI_NORMAL, NULL, cr);
    547 				/*
    548 				 * Caller is responsible for updating
    549 				 * i_seq if needed
    550 				 */
    551 			} else
    552 				error = 0;
    553 		}
    554 		if (error)
    555 			break;
    556 		uoff += n;
    557 		resid -= n;
    558 		/*
    559 		 * if file has grown larger than 2GB, set flag
    560 		 * in superblock if not already set
    561 		 */
    562 		if ((ip->i_size > MAXOFF32_T) &&
    563 		    !(fs->fs_flags & FSLARGEFILES)) {
    564 			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
    565 			mutex_enter(&ufsvfsp->vfs_lock);
    566 			fs->fs_flags |= FSLARGEFILES;
    567 			ufs_sbwrite(ufsvfsp);
    568 			mutex_exit(&ufsvfsp->vfs_lock);
    569 		}
    570 	} while (resid);
    571 
    572 	if (error) {
    573 		/*
    574 		 * restore original state
    575 		 */
    576 		if (resid) {
        			/* size unchanged: nothing to truncate away */
    577 			if (size == ip->i_size)
    578 				goto errout;
    579 			(void) ufs_itrunc(ip, size, 0, cr);
    580 		}
    581 		/*
    582 		 * try non-directio path
    583 		 */
    584 		goto errout;
    585 	}
    586 skip_alloc:
    587 
    588 	/*
    589 	 * get rid of cached pages
    590 	 */
    591 	vp = ITOV(ip);
        	/*
        	 * Record whether i_contents is held as writer at this point;
        	 * this decides the lock juggling below and the shared-write
        	 * accounting.
        	 */
    592 	exclusive = rw_write_held(&ip->i_contents);
    593 	if (vn_has_cached_data(vp)) {
    594 		if (!exclusive) {
    595 			/*
    596 			 * Still holding i_rwlock, so no allocations
    597 			 * can happen after dropping contents.
    598 			 */
    599 			rw_exit(&ip->i_contents);
    600 			rw_enter(&ip->i_contents, RW_WRITER);
    601 		}
    602 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
    603 		    B_INVAL, cr, NULL);
        		/*
        		 * Pages still cached: give up on direct I/O.  Note that
        		 * in the !exclusive case this leaves via errout with
        		 * i_contents still held as writer.
        		 */
    604 		if (vn_has_cached_data(vp))
    605 			goto errout;
    606 		if (!exclusive)
    607 			rw_downgrade(&ip->i_contents);
    608 		ufs_directio_kstats.nflushes.value.ui64++;
    609 	}
    610 
    611 	/*
    612 	 * Direct Writes
    613 	 */
    614 
    615 	if (!exclusive) {
    616 		ufs_shared_writes++;
    617 		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
    618 		if (ncur > ufs_maxcur_writes)
    619 			ufs_maxcur_writes = ncur;
    620 	}
    621 
    622 	/*
    623 	 * proc and as are for VM operations in directio_start()
    624 	 */
    625 	if (uio->uio_segflg == UIO_USERSPACE) {
    626 		procp = ttoproc(curthread);
    627 		as = procp->p_as;
    628 	} else {
    629 		procp = NULL;
    630 		as = &kas;
    631 	}
    632 	*statusp = DIRECTIO_SUCCESS;
    633 	error = 0;
    634 	newerror = 0;
    635 	resid = uio->uio_resid;
    636 	bytes_written = 0;
    637 	ufs_directio_kstats.logical_writes.value.ui64++;
        	/*
        	 * Outer loop: one pass per locked-down chunk of the current
        	 * iovec (at most vfs_ioclustsz bytes).  Inner loop: one request
        	 * per physically contiguous extent within that chunk.
        	 */
    638 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
    639 		size_t pglck_len, pglck_size;
    640 		caddr_t pglck_base;
    641 		page_t **pplist, **spplist;
    642 
    643 		tail = NULL;
    644 
    645 		/*
    646 		 * Adjust number of bytes
    647 		 */
    648 		iov = uio->uio_iov;
    649 		pglck_len = (size_t)MIN(iov->iov_len, resid);
    650 		pglck_base = iov->iov_base;
        		/* current iovec exhausted: step to the next one */
    651 		if (pglck_len == 0) {
    652 			uio->uio_iov++;
    653 			uio->uio_iovcnt--;
    654 			continue;
    655 		}
    656 
    657 		/*
    658 		 * Try to Lock down the largest chunk of pages possible.
    659 		 */
    660 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
    661 		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
    662 
    663 		if (error)
    664 			break;
    665 
        		/* pglck_len is consumed below; keep the length for unlock */
    666 		pglck_size = pglck_len;
    667 		while (pglck_len) {
    668 
    669 			nbytes = pglck_len;
    670 			uoff = uio->uio_loffset;
    671 
    672 			if (!bmap_peek) {
    673 
    674 				/*
    675 				 * Re-adjust number of bytes to contiguous
    676 				 * range. May have already called bmap_read
    677 				 * in the case of a concurrent rewrite.
    678 				 */
    679 				len = (ssize_t)blkroundup(fs, nbytes);
    680 				error = bmap_read(ip, uoff, &bn, &len);
    681 				if (error)
    682 					break;
        				/*
        				 * NOTE(review): this break leaves error == 0
        				 * and resid unchanged, so the outer loop would
        				 * retry the same range.  Presumably unreachable
        				 * because the range was allocated above (or is
        				 * a re-write) -- confirm.
        				 */
    683 				if (bn == UFS_HOLE || len == 0)
    684 					break;
    685 			}
        			/*
        			 * When bmap_peek was set, bn and len still hold the
        			 * values from the POSIX check above; the peek is
        			 * good for the first request only.
        			 */
    686 			nbytes = (size_t)MIN(nbytes, len);
    687 			bmap_peek = 0;
    688 
    689 			/*
    690 			 * Get the pagelist pointer for this offset to be
    691 			 * passed to directio_start.
    692 			 */
    693 
    694 			if (pplist != NULL)
    695 				spplist = pplist +
    696 				    btop((uintptr_t)iov->iov_base -
    697 				    ((uintptr_t)pglck_base & PAGEMASK));
    698 			else
    699 				spplist = NULL;
    700 
    701 			/*
    702 			 * Kick off the direct write requests
    703 			 */
    704 			directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
    705 			    iov->iov_base, S_READ, procp, &tail, spplist);
    706 
    707 			/*
    708 			 * Adjust pointers and counters
    709 			 */
    710 			iov->iov_len -= nbytes;
    711 			iov->iov_base += nbytes;
    712 			uio->uio_loffset += nbytes;
    713 			resid -= nbytes;
    714 			pglck_len -= nbytes;
    715 		}
    716 
    717 		/*
    718 		 * Wait for outstanding requests
    719 		 */
    720 		newerror = directio_wait(tail, &bytes_written);
    721 
    722 		/*
    723 		 * Release VM resources
    724 		 */
    725 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
    726 
    727 	}
    728 
    729 	if (!exclusive) {
    730 		atomic_add_32(&ufs_cur_writes, -1);
    731 		/*
    732 		 * If this write was done shared, readers may
    733 		 * have pulled in unmodified pages. Get rid of
    734 		 * these potentially stale pages.
    735 		 */
    736 		if (vn_has_cached_data(vp)) {
    737 			rw_exit(&ip->i_contents);
    738 			rw_enter(&ip->i_contents, RW_WRITER);
    739 			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
    740 			    B_INVAL, cr, NULL);
    741 			ufs_directio_kstats.nflushes.value.ui64++;
    742 			rw_downgrade(&ip->i_contents);
    743 		}
    744 	}
    745 
    746 	/*
    747 	 * If error, adjust resid to begin at the first
    748 	 * un-writable byte.
    749 	 */
    750 	if (error == 0)
    751 		error = newerror;
        	/*
        	 * uio->uio_resid is never decremented by the loop above, so on
        	 * error this is the original request size less the bytes that
        	 * directio_wait() reported as completed.
        	 */
    752 	if (error)
    753 		resid = uio->uio_resid - bytes_written;
    754 	arg_uio->uio_resid = resid;
    755 
    756 	if (!rewrite) {
    757 		ip->i_flag |= IUPD | ICHG;
    758 		/* Caller will update i_seq */
    759 		TRANS_INODE(ip->i_ufsvfs, ip);
    760 	}
    761 	/*
    762 	 * If there is a residual; adjust the EOF if necessary
    763 	 */
        	/*
        	 * The file is cut back to the larger of its size on entry and
        	 * the offset the transfer reached.
        	 */
    764 	if (resid) {
    765 		if (size != ip->i_size) {
    766 			if (uio->uio_loffset > size)
    767 				size = uio->uio_loffset;
    768 			(void) ufs_itrunc(ip, size, 0, cr);
    769 		}
    770 	}
    771 
        	/* release the bounce buffer if one was set up */
    772 	if (uio == &copy_uio)
    773 		kmem_free(copy_base, copy_resid);
    774 
    775 	return (error);
    776 
        	/*
        	 * Direct I/O declined after the point of no plain return:
        	 * free the bounce buffer (if any) and return 0.  *statusp is
        	 * still DIRECTIO_FAILURE on every path that reaches here.
        	 */
    777 errout:
    778 	if (uio == &copy_uio)
    779 		kmem_free(copy_base, copy_resid);
    780 
    781 	return (0);
    782 }
    783 /*
    784  * Direct read of a hole
    785  */
    786 static int
    787 directio_hole(struct uio *uio, size_t nbytes)
    788 {
    789 	int		error = 0, nzero;
    790 	uio_t		phys_uio;
    791 	iovec_t		phys_iov;
    792 
    793 	ufs_directio_kstats.hole_reads.value.ui64++;
    794 	ufs_directio_kstats.nread.value.ui64 += nbytes;
    795 
    796 	phys_iov.iov_base = uio->uio_iov->iov_base;
    797 	phys_iov.iov_len = nbytes;
    798 
    799 	phys_uio.uio_iov = &phys_iov;
    800 	phys_uio.uio_iovcnt = 1;
    801 	phys_uio.uio_resid = phys_iov.iov_len;
    802 	phys_uio.uio_segflg = uio->uio_segflg;
    803 	phys_uio.uio_extflg = uio->uio_extflg;
    804 	while (error == 0 && phys_uio.uio_resid) {
    805 		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
    806 		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
    807 		    &phys_uio);
    808 	}
    809 	return (error);
    810 }
    811 
    812 /*
    813  * Direct Read
         *
         * Try to satisfy a read directly from the device into the caller's
         * buffers, bypassing the page cache.
         *
         *	ip	- inode of the file being read.  The rw_exit(),
         *		  rw_enter() and rw_downgrade() calls on ip->i_contents
         *		  below presume the caller already holds that lock.
         *	uio	- the request.  Offset and length must be multiples of
         *		  DEV_BSIZE, and every iovec must have a DEV_BSIZE
         *		  multiple length and an even base address, otherwise
         *		  direct I/O is declined.
         *	cr	- credentials handed to VOP_PUTPAGE().
         *	statusp	- DIRECTIO_SUCCESS if the read was handled here (this
         *		  includes an offset at or past EOF, where nothing is
         *		  transferred); DIRECTIO_FAILURE if it was declined.
         *
         * Returns 0 whenever direct I/O is declined.  Otherwise returns the
         * first error from as_pagelock(), bmap_read(), directio_hole() or the
         * I/O itself, or 0; uio_resid is reduced by the bytes read.
    814  */
    815 int
    816 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
    817 {
    818 	ssize_t		resid, bytes_read;
    819 	u_offset_t	size, uoff;
    820 	int		error, newerror, len;
    821 	size_t		nbytes;
    822 	struct fs	*fs;
    823 	vnode_t		*vp;
    824 	daddr_t		bn;
    825 	iovec_t		*iov;
    826 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
    827 	struct proc	*procp;
    828 	struct as	*as;
    829 	struct directio_buf	*tail;
    830 
    831 	/*
    832 	 * assume that directio isn't possible (normal case)
    833 	 */
    834 	*statusp = DIRECTIO_FAILURE;
    835 
    836 	/*
    837 	 * Don't go direct
    838 	 */
    839 	if (ufs_directio_enabled == 0)
    840 		return (0);
    841 
    842 	/*
    843 	 * mapped file; nevermind
    844 	 */
    845 	if (ip->i_mapcnt)
    846 		return (0);
    847 
    848 	/*
    849 	 * CAN WE DO DIRECT IO?
    850 	 */
    851 	/*
    852 	 * must be sector aligned
    853 	 */
    854 	uoff = uio->uio_loffset;
    855 	resid = uio->uio_resid;
    856 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
    857 		return (0);
    858 	/*
    859 	 * must be short aligned and sector aligned
    860 	 */
        	/* nbytes is borrowed here as the iovec countdown */
    861 	iov = uio->uio_iov;
    862 	nbytes = uio->uio_iovcnt;
    863 	while (nbytes--) {
    864 		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
    865 			return (0);
    866 		if ((intptr_t)(iov++->iov_base) & 1)
    867 			return (0);
    868 	}
    869 
    870 	/*
    871 	 * DIRECTIO
    872 	 */
    873 	fs = ip->i_fs;
    874 
    875 	/*
    876 	 * don't read past EOF
    877 	 */
    878 	size = ip->i_size;
    879 
    880 	/*
    881 	 * The file offset is past EOF so bail out here; we don't want
    882 	 * to update uio_resid and make it look like we read something.
    883 	 * We say that direct I/O was a success to avoid having rdip()
    884 	 * go through the same "read past EOF logic".
    885 	 */
    886 	if (uoff >= size) {
    887 		*statusp = DIRECTIO_SUCCESS;
    888 		return (0);
    889 	}
    890 
    891 	/*
    892 	 * The read would extend past EOF so make it smaller.
    893 	 */
    894 	if ((uoff + resid) > size) {
    895 		resid = size - uoff;
    896 		/*
    897 		 * recheck sector alignment
    898 		 */
    899 		if (resid & (DEV_BSIZE - 1))
    900 			return (0);
    901 	}
    902 
    903 	/*
    904 	 * At this point, we know there is some real work to do.
    905 	 */
    906 	ASSERT(resid);
    907 
    908 	/*
    909 	 * get rid of cached pages
    910 	 */
    911 	vp = ITOV(ip);
    912 	if (vn_has_cached_data(vp)) {
        		/*
        		 * Not an atomic upgrade: i_contents is released and then
        		 * re-acquired as writer for the invalidation.
        		 */
    913 		rw_exit(&ip->i_contents);
    914 		rw_enter(&ip->i_contents, RW_WRITER);
    915 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
    916 		    B_INVAL, cr, NULL);
        		/*
        		 * NOTE(review): this return leaves i_contents held as
        		 * writer (no rw_downgrade) -- confirm the caller expects
        		 * that on the fallback path.
        		 */
    917 		if (vn_has_cached_data(vp))
    918 			return (0);
    919 		rw_downgrade(&ip->i_contents);
    920 		ufs_directio_kstats.nflushes.value.ui64++;
    921 	}
    922 	/*
    923 	 * Direct Reads
    924 	 */
    925 
    926 	/*
    927 	 * proc and as are for VM operations in directio_start()
    928 	 */
    929 	if (uio->uio_segflg == UIO_USERSPACE) {
    930 		procp = ttoproc(curthread);
    931 		as = procp->p_as;
    932 	} else {
    933 		procp = NULL;
    934 		as = &kas;
    935 	}
    936 
    937 	*statusp = DIRECTIO_SUCCESS;
    938 	error = 0;
    939 	newerror = 0;
    940 	bytes_read = 0;
    941 	ufs_directio_kstats.logical_reads.value.ui64++;
        	/*
        	 * Outer loop: one pass per locked-down chunk of the current
        	 * iovec (at most vfs_ioclustsz bytes).  Inner loop: one request
        	 * (or one zero-fill) per extent within that chunk.
        	 */
    942 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
    943 		size_t pglck_len, pglck_size;
    944 		caddr_t pglck_base;
    945 		page_t **pplist, **spplist;
    946 
    947 		tail = NULL;
    948 
    949 		/*
    950 		 * Adjust number of bytes
    951 		 */
    952 		iov = uio->uio_iov;
    953 		pglck_len = (size_t)MIN(iov->iov_len, resid);
    954 		pglck_base = iov->iov_base;
        		/* current iovec exhausted: step to the next one */
    955 		if (pglck_len == 0) {
    956 			uio->uio_iov++;
    957 			uio->uio_iovcnt--;
    958 			continue;
    959 		}
    960 
    961 		/*
    962 		 * Try to Lock down the largest chunk of pages possible.
    963 		 */
    964 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
    965 		error = as_pagelock(as, &pplist, pglck_base,
    966 		    pglck_len, S_WRITE);
    967 
    968 		if (error)
    969 			break;
    970 
        		/* pglck_len is consumed below; keep the length for unlock */
    971 		pglck_size = pglck_len;
    972 		while (pglck_len) {
    973 
    974 			nbytes = pglck_len;
    975 			uoff = uio->uio_loffset;
    976 
    977 			/*
    978 			 * Re-adjust number of bytes to contiguous range
    979 			 */
    980 			len = (ssize_t)blkroundup(fs, nbytes);
    981 			error = bmap_read(ip, uoff, &bn, &len);
    982 			if (error)
    983 				break;
    984 
    985 			if (bn == UFS_HOLE) {
        				/*
        				 * Zero-fill no further than the end of the
        				 * current file-system block; the next pass
        				 * maps the following block afresh.
        				 */
    986 				nbytes = (size_t)MIN(fs->fs_bsize -
    987 				    (long)blkoff(fs, uoff), nbytes);
    988 				error = directio_hole(uio, nbytes);
    989 				/*
    990 				 * Hole reads are not added to the list
    991 				 * processed by directio_wait() below so
    992 				 * account for bytes read here.
    993 				 */
    994 				if (!error)
    995 					bytes_read += nbytes;
    996 			} else {
    997 				nbytes = (size_t)MIN(nbytes, len);
    998 
    999 				/*
   1000 				 * Get the pagelist pointer for this offset
   1001 				 * to be passed to directio_start.
   1002 				 */
   1003 				if (pplist != NULL)
   1004 					spplist = pplist +
   1005 					    btop((uintptr_t)iov->iov_base -
   1006 					    ((uintptr_t)pglck_base & PAGEMASK));
   1007 				else
   1008 					spplist = NULL;
   1009 
   1010 				/*
   1011 				 * Kick off the direct read requests
   1012 				 */
   1013 				directio_start(ufsvfsp, ip, nbytes,
   1014 				    ldbtob(bn), iov->iov_base,
   1015 				    S_WRITE, procp, &tail, spplist);
   1016 			}
   1017 
        			/* only directio_hole() can have set error here */
   1018 			if (error)
   1019 				break;
   1020 
   1021 			/*
   1022 			 * Adjust pointers and counters
   1023 			 */
   1024 			iov->iov_len -= nbytes;
   1025 			iov->iov_base += nbytes;
   1026 			uio->uio_loffset += nbytes;
   1027 			resid -= nbytes;
   1028 			pglck_len -= nbytes;
   1029 		}
   1030 
   1031 		/*
   1032 		 * Wait for outstanding requests
   1033 		 */
        		/*
        		 * Runs even when the inner loop stopped on an error, so
        		 * requests already issued are always collected before the
        		 * pages are unlocked.
        		 */
   1034 		newerror = directio_wait(tail, &bytes_read);
   1035 		/*
   1036 		 * Release VM resources
   1037 		 */
   1038 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
   1039 
   1040 	}
   1041 
   1042 	/*
   1043 	 * If error, adjust resid to begin at the first
   1044 	 * un-read byte.
   1045 	 */
   1046 	if (error == 0)
   1047 		error = newerror;
   1048 	uio->uio_resid -= bytes_read;
   1049 	return (error);
   1050 }
   1051