Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/param.h>
     30 #include <sys/sysmacros.h>
     31 #include <sys/conf.h>
     32 #include <sys/fssnap_if.h>
     33 #include <sys/fs/ufs_inode.h>
     34 #include <sys/fs/ufs_lockfs.h>
     35 #include <sys/fs/ufs_log.h>
     36 #include <sys/fs/ufs_trans.h>
     37 #include <sys/cmn_err.h>
     38 #include <vm/pvn.h>
     39 #include <vm/seg_map.h>
     40 #include <sys/fdbuffer.h>
     41 
     42 #ifdef DEBUG
     43 int evn_ufs_debug = 0;
     44 #define	DEBUGF(args)	{ if (evn_ufs_debug) cmn_err args; }
     45 #else
     46 #define	DEBUGF(args)
     47 #endif
     48 
     49 /*
     50  * ufs_rdwr_data - supports reading or writing data when
     51  * no changes are permitted in file size or space allocation.
     52  *
     53  * Inputs:
     54  * fdb - The mandatory fdbuffer supports
     55  *	the read or write operation.
     56  * flags - defaults (zero value) to synchronous write
     57  *	B_READ - indicates read operation
     58  *	B_ASYNC - indicates perform operation asynchronously
     59  */
     60 /*ARGSUSED*/
     61 int
     62 ufs_rdwr_data(
     63 	vnode_t		*vnodep,
     64 	u_offset_t	offset,
     65 	size_t		len,
     66 	fdbuffer_t	*fdbp,
     67 	int		flags,
     68 	cred_t		*credp)
     69 {
     70 	struct inode	*ip = VTOI(vnodep);
     71 	struct fs	*fs;
     72 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
     73 	struct buf	*bp;
     74 	krw_t		rwtype = RW_READER;
     75 	u_offset_t	offset1 = offset;	/* Initial offset */
     76 	size_t		iolen;
     77 	int		curlen = 0;
     78 	int		pplen;
     79 	daddr_t		bn;
     80 	int		contig = 0;
     81 	int		error = 0;
     82 	int		nbytes;			/* Number bytes this IO */
     83 	int		offsetn;		/* Start point this IO */
     84 	int		iswrite = flags & B_WRITE;
     85 	int		io_started = 0;		/* No IO started */
     86 	struct ulockfs	*ulp;
     87 	uint_t		protp = PROT_ALL;
     88 
     89 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, !iswrite,
     90 	    &protp);
     91 	if (error) {
     92 		if (flags & B_ASYNC) {
     93 			fdb_ioerrdone(fdbp, error);
     94 		}
     95 		return (error);
     96 	}
     97 	fs = ufsvfsp->vfs_fs;
     98 	iolen = len;
     99 
    100 	DEBUGF((CE_CONT, "?ufs_rdwr: %s vp: %p pages:%p  off %llx len %lx"
    101 	    " isize: %llx fdb: %p\n",
    102 	    flags & B_READ ? "READ" : "WRITE", (void *)vnodep,
    103 	    (void *)vnodep->v_pages, offset1, iolen, ip->i_size, (void *)fdbp));
    104 
    105 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
    106 	rw_enter(&ip->i_contents, rwtype);
    107 
    108 	ASSERT(offset1 < ip->i_size);
    109 
    110 	if ((offset1 + iolen) > ip->i_size) {
    111 		iolen = ip->i_size - offset1;
    112 	}
    113 	while (!error && curlen < iolen) {
    114 
    115 		contig = 0;
    116 
    117 		if ((error = bmap_read(ip, offset1, &bn, &contig)) != 0) {
    118 			break;
    119 		}
    120 		ASSERT(!(bn == UFS_HOLE && iswrite));
    121 		if (bn == UFS_HOLE) {
    122 			/*
    123 			 * If the above assertion is true,
    124 			 * then the following if statement can never be true.
    125 			 */
    126 			if (iswrite && (rwtype == RW_READER)) {
    127 				rwtype = RW_WRITER;
    128 				if (!rw_tryupgrade(&ip->i_contents)) {
    129 					rw_exit(&ip->i_contents);
    130 					rw_enter(&ip->i_contents, rwtype);
    131 					continue;
    132 				}
    133 			}
    134 			offsetn = blkoff(fs, offset1);
    135 			pplen = P2ROUNDUP(len, PAGESIZE);
    136 			nbytes = MIN((pplen - curlen),
    137 			    (fs->fs_bsize - offsetn));
    138 			ASSERT(nbytes > 0);
    139 
    140 			/*
    141 			 * We may be reading or writing.
    142 			 */
    143 			DEBUGF((CE_CONT, "?ufs_rdwr_data: hole %llx - %lx\n",
    144 			    offset1, (iolen - curlen)));
    145 
    146 			if (iswrite) {
    147 				printf("**WARNING: ignoring hole in write\n");
    148 				error = ENOSPC;
    149 			} else {
    150 				fdb_add_hole(fdbp, offset1 - offset, nbytes);
    151 			}
    152 			offset1 += nbytes;
    153 			curlen += nbytes;
    154 			continue;
    155 
    156 		}
    157 		ASSERT(contig > 0);
    158 		pplen = P2ROUNDUP(len, PAGESIZE);
    159 
    160 		contig = MIN(contig, len - curlen);
    161 		contig = P2ROUNDUP(contig, DEV_BSIZE);
    162 
    163 		bp = fdb_iosetup(fdbp, offset1 - offset, contig, vnodep, flags);
    164 
    165 		bp->b_edev = ip->i_dev;
    166 		bp->b_dev = cmpdev(ip->i_dev);
    167 		bp->b_blkno = bn;
    168 		bp->b_file = ip->i_vnode;
    169 		bp->b_offset = (offset_t)offset1;
    170 
    171 		if (ufsvfsp->vfs_snapshot) {
    172 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
    173 		} else {
    174 			(void) bdev_strategy(bp);
    175 		}
    176 		io_started = 1;
    177 
    178 		offset1 += contig;
    179 		curlen += contig;
    180 		if (iswrite)
    181 			lwp_stat_update(LWP_STAT_OUBLK, 1);
    182 		else
    183 			lwp_stat_update(LWP_STAT_INBLK, 1);
    184 
    185 		if ((flags & B_ASYNC) == 0) {
    186 			error = biowait(bp);
    187 			fdb_iodone(bp);
    188 		}
    189 
    190 		DEBUGF((CE_CONT, "?loop ufs_rdwr_data.. off %llx len %lx\n",
    191 		    offset1, (iolen - curlen)));
    192 	}
    193 
    194 	DEBUGF((CE_CONT, "?ufs_rdwr_data: off %llx len %lx pages: %p ------\n",
    195 	    offset1, (iolen - curlen), (void *)vnodep->v_pages));
    196 
    197 	rw_exit(&ip->i_contents);
    198 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
    199 
    200 	if (flags & B_ASYNC) {
    201 		/*
    202 		 * Show that no more asynchronous IO will be added
    203 		 */
    204 		fdb_ioerrdone(fdbp, error);
    205 	}
    206 	if (ulp) {
    207 		ufs_lockfs_end(ulp);
    208 	}
    209 	if (io_started && flags & B_ASYNC) {
    210 		return (0);
    211 	} else {
    212 		return (error);
    213 	}
    214 }
    215 
    216 /*
    217  * ufs_alloc_data - supports allocating space and reads or writes
    218  * that involve changes to file length or space allocation.
    219  *
    220  * This function is more expensive, because of the UFS log transaction,
    221  * so ufs_rdwr_data() should be used when space or file length changes
    222  * will not occur.
    223  *
    224  * Inputs:
    225  * fdb - A null pointer instructs this function to only allocate
    226  *	space for the specified offset and length.
    227  *	An actual fdbuffer instructs this function to perform
    228  *	the read or write operation.
    229  * flags - defaults (zero value) to synchronous write
    230  *	B_READ - indicates read operation
    231  *	B_ASYNC - indicates perform operation asynchronously
    232  */
    233 int
    234 ufs_alloc_data(
    235 	vnode_t		*vnodep,
    236 	u_offset_t	offset,
    237 	size_t		*len,
    238 	fdbuffer_t	*fdbp,
    239 	int		flags,
    240 	cred_t		*credp)
    241 {
    242 	struct inode	*ip = VTOI(vnodep);
    243 	size_t		done_len, io_len;
    244 	int		contig;
    245 	u_offset_t	uoff, io_off;
    246 	int		error = 0;		/* No error occurred */
    247 	int		offsetn;		/* Start point this IO */
    248 	int		nbytes;			/* Number bytes in this IO */
    249 	daddr_t		bn;
    250 	struct fs	*fs;
    251 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
    252 	int		i_size_changed = 0;
    253 	u_offset_t	old_i_size;
    254 	struct ulockfs	*ulp;
    255 	int		trans_size;
    256 	int		issync;			/* UFS Log transaction */
    257 						/* synchronous when non-zero */
    258 
    259 	int		io_started = 0;		/* No IO started */
    260 	uint_t		protp = PROT_ALL;
    261 
    262 	ASSERT((flags & B_WRITE) == 0);
    263 
    264 	/*
    265 	 * Obey the lockfs protocol
    266 	 */
    267 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, 0, &protp);
    268 	if (error) {
    269 		if ((fdbp != NULL) && (flags & B_ASYNC)) {
    270 			fdb_ioerrdone(fdbp, error);
    271 		}
    272 		return (error);
    273 	}
    274 	if (ulp) {
    275 		/*
    276 		 * Try to begin a UFS log transaction
    277 		 */
    278 		trans_size = TOP_GETPAGE_SIZE(ip);
    279 		TRANS_TRY_BEGIN_CSYNC(ufsvfsp, issync, TOP_GETPAGE,
    280 		    trans_size, error);
    281 		if (error == EWOULDBLOCK) {
    282 			ufs_lockfs_end(ulp);
    283 			if ((fdbp != NULL) && (flags & B_ASYNC)) {
    284 				fdb_ioerrdone(fdbp, EDEADLK);
    285 			}
    286 			return (EDEADLK);
    287 		}
    288 	}
    289 
    290 	uoff = offset;
    291 	io_off = offset;
    292 	io_len = *len;
    293 	done_len = 0;
    294 
    295 	DEBUGF((CE_CONT, "?ufs_alloc: off %llx len %lx size %llx fdb: %p\n",
    296 	    uoff, (io_len - done_len), ip->i_size, (void *)fdbp));
    297 
    298 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
    299 	rw_enter(&ip->i_contents, RW_WRITER);
    300 
    301 	ASSERT((ip->i_mode & IFMT) == IFREG);
    302 
    303 	fs = ip->i_fs;
    304 
    305 	while (error == 0 && done_len < io_len) {
    306 		uoff = (u_offset_t)(io_off + done_len);
    307 		offsetn = (int)blkoff(fs, uoff);
    308 		nbytes = (int)MIN(fs->fs_bsize - offsetn, io_len - done_len);
    309 
    310 		DEBUGF((CE_CONT, "?ufs_alloc_data: offset: %llx len %x\n",
    311 		    uoff, nbytes));
    312 
    313 		if (uoff + nbytes > ip->i_size) {
    314 			/*
    315 			 * We are extending the length of the file.
    316 			 * bmap is used so that we are sure that
    317 			 * if we need to allocate new blocks, that it
    318 			 * is done here before we up the file size.
    319 			 */
    320 			DEBUGF((CE_CONT, "?ufs_alloc_data: grow %llx -> %llx\n",
    321 			    ip->i_size, uoff + nbytes));
    322 
    323 			error = bmap_write(ip, uoff, (offsetn + nbytes),
    324 			    BI_ALLOC_ONLY, NULL, credp);
    325 			if (ip->i_flag & (ICHG|IUPD))
    326 				ip->i_seq++;
    327 			if (error) {
    328 				DEBUGF((CE_CONT, "?ufs_alloc_data: grow "
    329 				    "failed err: %d\n", error));
    330 				break;
    331 			}
    332 			if (fdbp != NULL) {
    333 				if (uoff >= ip->i_size) {
    334 					/*
    335 					 * Desired offset is past end of bytes
    336 					 * in file, so we have a hole.
    337 					 */
    338 					fdb_add_hole(fdbp, uoff - offset,
    339 					    nbytes);
    340 				} else {
    341 					int contig;
    342 					buf_t *bp;
    343 
    344 					error = bmap_read(ip, uoff, &bn,
    345 					    &contig);
    346 					if (error) {
    347 						break;
    348 					}
    349 
    350 					contig = ip->i_size - uoff;
    351 					contig = P2ROUNDUP(contig, DEV_BSIZE);
    352 
    353 					bp = fdb_iosetup(fdbp, uoff - offset,
    354 					    contig, vnodep, flags);
    355 
    356 					bp->b_edev = ip->i_dev;
    357 					bp->b_dev = cmpdev(ip->i_dev);
    358 					bp->b_blkno = bn;
    359 					bp->b_file = ip->i_vnode;
    360 					bp->b_offset = (offset_t)uoff;
    361 
    362 					if (ufsvfsp->vfs_snapshot) {
    363 						fssnap_strategy(
    364 						    &ufsvfsp->vfs_snapshot, bp);
    365 					} else {
    366 						(void) bdev_strategy(bp);
    367 					}
    368 					io_started = 1;
    369 
    370 					lwp_stat_update(LWP_STAT_OUBLK, 1);
    371 
    372 					if ((flags & B_ASYNC) == 0) {
    373 						error = biowait(bp);
    374 						fdb_iodone(bp);
    375 						if (error) {
    376 							break;
    377 						}
    378 					}
    379 					if (contig > (ip->i_size - uoff)) {
    380 						contig -= ip->i_size - uoff;
    381 
    382 						fdb_add_hole(fdbp,
    383 						    ip->i_size - offset,
    384 						    contig);
    385 					}
    386 				}
    387 			}
    388 
    389 			i_size_changed = 1;
    390 			old_i_size = ip->i_size;
    391 			UFS_SET_ISIZE(uoff + nbytes, ip);
    392 			TRANS_INODE(ip->i_ufsvfs, ip);
    393 			/*
    394 			 * file has grown larger than 2GB. Set flag
    395 			 * in superblock to indicate this, if it
    396 			 * is not already set.
    397 			 */
    398 			if ((ip->i_size > MAXOFF32_T) &&
    399 			    !(fs->fs_flags & FSLARGEFILES)) {
    400 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
    401 				mutex_enter(&ufsvfsp->vfs_lock);
    402 				fs->fs_flags |= FSLARGEFILES;
    403 				ufs_sbwrite(ufsvfsp);
    404 				mutex_exit(&ufsvfsp->vfs_lock);
    405 			}
    406 		} else {
    407 			/*
    408 			 * The file length is not being extended.
    409 			 */
    410 			error = bmap_read(ip, uoff, &bn, &contig);
    411 			if (error) {
    412 				DEBUGF((CE_CONT, "?ufs_alloc_data: "
    413 				    "bmap_read err: %d\n", error));
    414 				break;
    415 			}
    416 
    417 			if (bn != UFS_HOLE) {
    418 				/*
    419 				 * Did not map a hole in the file
    420 				 */
    421 				int	contig = P2ROUNDUP(nbytes, DEV_BSIZE);
    422 				buf_t	*bp;
    423 
    424 				if (fdbp != NULL) {
    425 					bp = fdb_iosetup(fdbp, uoff - offset,
    426 					    contig, vnodep, flags);
    427 
    428 					bp->b_edev = ip->i_dev;
    429 					bp->b_dev = cmpdev(ip->i_dev);
    430 					bp->b_blkno = bn;
    431 					bp->b_file = ip->i_vnode;
    432 					bp->b_offset = (offset_t)uoff;
    433 
    434 					if (ufsvfsp->vfs_snapshot) {
    435 						fssnap_strategy(
    436 						    &ufsvfsp->vfs_snapshot, bp);
    437 					} else {
    438 						(void) bdev_strategy(bp);
    439 					}
    440 					io_started = 1;
    441 
    442 					lwp_stat_update(LWP_STAT_OUBLK, 1);
    443 
    444 					if ((flags & B_ASYNC) == 0) {
    445 						error = biowait(bp);
    446 						fdb_iodone(bp);
    447 						if (error) {
    448 							break;
    449 						}
    450 					}
    451 				}
    452 			} else {
    453 				/*
    454 				 * We read a hole in the file.
    455 				 * We have to allocate blocks for the hole.
    456 				 */
    457 				error = bmap_write(ip, uoff, (offsetn + nbytes),
    458 				    BI_ALLOC_ONLY, NULL, credp);
    459 				if (ip->i_flag & (ICHG|IUPD))
    460 					ip->i_seq++;
    461 				if (error) {
    462 					DEBUGF((CE_CONT, "?ufs_alloc_data: fill"
    463 					    " hole failed error: %d\n", error));
    464 					break;
    465 				}
    466 				if (fdbp != NULL) {
    467 					fdb_add_hole(fdbp, uoff - offset,
    468 					    nbytes);
    469 				}
    470 			}
    471 		}
    472 		done_len += nbytes;
    473 	}
    474 
    475 	if (error) {
    476 		if (i_size_changed) {
    477 			/*
    478 			 * Allocation of the blocks for the file failed.
    479 			 * So truncate the file size back to its original size.
    480 			 */
    481 			(void) ufs_itrunc(ip, old_i_size, 0, credp);
    482 		}
    483 	}
    484 
    485 	DEBUGF((CE_CONT, "?ufs_alloc: uoff %llx len %lx\n",
    486 	    uoff, (io_len - done_len)));
    487 
    488 	if ((offset + *len) < (NDADDR * fs->fs_bsize)) {
    489 		*len = (size_t)(roundup(offset + *len, fs->fs_fsize) - offset);
    490 	} else {
    491 		*len = (size_t)(roundup(offset + *len, fs->fs_bsize) - offset);
    492 	}
    493 
    494 	/*
    495 	 * Flush cached pages.
    496 	 *
    497 	 * XXX - There should be no pages involved, since the I/O was performed
    498 	 * through the device strategy routine and the page cache was bypassed.
    499 	 * However, testing has demonstrated that this VOP_PUTPAGE is
    500 	 * necessary. Without this, data might not always be read back as it
    501 	 * was written.
    502 	 *
    503 	 */
    504 	(void) VOP_PUTPAGE(vnodep, 0, 0, B_INVAL, credp, NULL);
    505 
    506 	rw_exit(&ip->i_contents);
    507 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
    508 
    509 	if ((fdbp != NULL) && (flags & B_ASYNC)) {
    510 		/*
    511 		 * Show that no more asynchronous IO will be added
    512 		 */
    513 		fdb_ioerrdone(fdbp, error);
    514 	}
    515 	if (ulp) {
    516 		/*
    517 		 * End the UFS Log transaction
    518 		 */
    519 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_GETPAGE,
    520 		    trans_size);
    521 		ufs_lockfs_end(ulp);
    522 	}
    523 	if (io_started && (flags & B_ASYNC)) {
    524 		return (0);
    525 	} else {
    526 		return (error);
    527 	}
    528 }
    529