Home | History | Annotate | Download | only in udfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/t_lock.h>
     29 #include <sys/param.h>
     30 #include <sys/time.h>
     31 #include <sys/systm.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/resource.h>
     34 #include <sys/signal.h>
     35 #include <sys/cred.h>
     36 #include <sys/user.h>
     37 #include <sys/buf.h>
     38 #include <sys/vfs.h>
     39 #include <sys/vfs_opreg.h>
     40 #include <sys/stat.h>
     41 #include <sys/vnode.h>
     42 #include <sys/mode.h>
     43 #include <sys/proc.h>
     44 #include <sys/disp.h>
     45 #include <sys/file.h>
     46 #include <sys/fcntl.h>
     47 #include <sys/flock.h>
     48 #include <sys/kmem.h>
     49 #include <sys/uio.h>
     50 #include <sys/dnlc.h>
     51 #include <sys/conf.h>
     52 #include <sys/errno.h>
     53 #include <sys/mman.h>
     54 #include <sys/fbuf.h>
     55 #include <sys/pathname.h>
     56 #include <sys/debug.h>
     57 #include <sys/vmsystm.h>
     58 #include <sys/cmn_err.h>
     59 #include <sys/dirent.h>
     60 #include <sys/errno.h>
     61 #include <sys/modctl.h>
     62 #include <sys/statvfs.h>
     63 #include <sys/mount.h>
     64 #include <sys/sunddi.h>
     65 #include <sys/bootconf.h>
     66 #include <sys/policy.h>
     67 
     68 #include <vm/hat.h>
     69 #include <vm/page.h>
     70 #include <vm/pvn.h>
     71 #include <vm/as.h>
     72 #include <vm/seg.h>
     73 #include <vm/seg_map.h>
     74 #include <vm/seg_kmem.h>
     75 #include <vm/seg_vn.h>
     76 #include <vm/rm.h>
     77 #include <vm/page.h>
     78 #include <sys/swap.h>
     79 
     80 #include <fs/fs_subr.h>
     81 
     82 #include <sys/fs/udf_volume.h>
     83 #include <sys/fs/udf_inode.h>
     84 
/*
 * Forward declarations of the file-local vnode operation entry points.
 * Each of these is referenced from udf_vnodeops_template[] further down.
 */
static int32_t udf_open(struct vnode **,
	int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
	caller_context_t *);
static int32_t udf_getattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
	int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
	char *, struct vnode **, struct pathname *,
	int32_t, struct vnode *, struct cred *,
	caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
	char *, struct vattr *, enum vcexcl,
	int32_t, struct vnode **, struct cred *, int32_t,
	caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
	char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
	struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
	char *, struct vattr *, struct vnode **, struct cred *,
	caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
	char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
	struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
	int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
	struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
	caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
	caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
	size_t, uint32_t *, struct page **, size_t,
	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
	size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
	ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
	u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);

/*
 * Helpers with external linkage.  ud_putpages(), ud_rdip() and ud_wrip()
 * are called from udf_close(), udf_read() and udf_write() respectively.
 */
int32_t ud_getpage_miss(struct vnode *, u_offset_t,
	size_t, struct seg *, caddr_t, page_t *pl[],
	size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
	u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);
    172 
/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
/* NOTE(review): presumably the source of mm_index values -- confirm */
uint32_t master_index = 0;
typedef struct mio_master {
	kmutex_t	mm_mutex;	/* protect the fields below */
	/* NOTE(review): mm_size is undocumented here -- confirm its meaning */
	int32_t		mm_size;
	buf_t		*mm_bp;		/* original bp */
	int32_t		mm_resid;	/* bytes remaining to transfer */
	int32_t		mm_error;	/* accumulated error from slaves */
	int32_t		mm_index;	/* XXX debugging */
} mio_master_t;

/*
 * One slave per IO chunk: it embeds its own buf_t and points back at
 * the master it belongs to.
 */
typedef struct mio_slave {
	buf_t		ms_buf;		/* working buffer for this IO chunk */
	mio_master_t	*ms_ptr;	/* pointer to master */
} mio_slave_t;
    195 
/*
 * Pointer to the udfs vnode operations vector.  It is only defined
 * here; nothing in this part of the file assigns it.
 */
struct vnodeops *udf_vnodeops;

/*
 * Table pairing each VOPNAME_* operation name with the routine that
 * implements it for udfs.  Every entry except VOPNAME_VNEVENT points
 * at a static function declared above; VOPNAME_VNEVENT uses
 * fs_vnevent_support, which is not defined in this file.  The table
 * is terminated by a NULL/NULL entry.
 */
const fs_operation_def_t udf_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = udf_open },
	VOPNAME_CLOSE,		{ .vop_close = udf_close },
	VOPNAME_READ,		{ .vop_read = udf_read },
	VOPNAME_WRITE,		{ .vop_write = udf_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
	VOPNAME_ACCESS,		{ .vop_access = udf_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
	VOPNAME_CREATE,		{ .vop_create = udf_create },
	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
	VOPNAME_LINK,		{ .vop_link = udf_link },
	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
	VOPNAME_FID,		{ .vop_fid = udf_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
	VOPNAME_SPACE,		{ .vop_space = udf_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
	VOPNAME_MAP,		{ .vop_map = udf_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
    235 
    236 /* ARGSUSED */
    237 static int32_t
    238 udf_open(
    239 	struct vnode **vpp,
    240 	int32_t flag,
    241 	struct cred *cr,
    242 	caller_context_t *ct)
    243 {
    244 	ud_printf("udf_open\n");
    245 
    246 	return (0);
    247 }
    248 
    249 /* ARGSUSED */
    250 static int32_t
    251 udf_close(
    252 	struct vnode *vp,
    253 	int32_t flag,
    254 	int32_t count,
    255 	offset_t offset,
    256 	struct cred *cr,
    257 	caller_context_t *ct)
    258 {
    259 	struct ud_inode *ip = VTOI(vp);
    260 
    261 	ud_printf("udf_close\n");
    262 
    263 	ITIMES(ip);
    264 
    265 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
    266 	cleanshares(vp, ttoproc(curthread)->p_pid);
    267 
    268 	/*
    269 	 * Push partially filled cluster at last close.
    270 	 * ``last close'' is approximated because the dnlc
    271 	 * may have a hold on the vnode.
    272 	 */
    273 	if (vp->v_count <= 2 && vp->v_type != VBAD) {
    274 		struct ud_inode *ip = VTOI(vp);
    275 		if (ip->i_delaylen) {
    276 			(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
    277 			    B_ASYNC | B_FREE, cr);
    278 			ip->i_delaylen = 0;
    279 		}
    280 	}
    281 
    282 	return (0);
    283 }
    284 
    285 /* ARGSUSED */
    286 static int32_t
    287 udf_read(
    288 	struct vnode *vp,
    289 	struct uio *uiop,
    290 	int32_t ioflag,
    291 	struct cred *cr,
    292 	caller_context_t *ct)
    293 {
    294 	struct ud_inode *ip = VTOI(vp);
    295 	int32_t error;
    296 
    297 	ud_printf("udf_read\n");
    298 
    299 #ifdef	__lock_lint
    300 	rw_enter(&ip->i_rwlock, RW_READER);
    301 #endif
    302 
    303 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
    304 
    305 	if (MANDLOCK(vp, ip->i_char)) {
    306 		/*
    307 		 * udf_getattr ends up being called by chklock
    308 		 */
    309 		error = chklock(vp, FREAD, uiop->uio_loffset,
    310 		    uiop->uio_resid, uiop->uio_fmode, ct);
    311 		if (error) {
    312 			goto end;
    313 		}
    314 	}
    315 
    316 	rw_enter(&ip->i_contents, RW_READER);
    317 	error = ud_rdip(ip, uiop, ioflag, cr);
    318 	rw_exit(&ip->i_contents);
    319 
    320 end:
    321 #ifdef	__lock_lint
    322 	rw_exit(&ip->i_rwlock);
    323 #endif
    324 
    325 	return (error);
    326 }
    327 
    328 
/*
 * Write-throttling knobs used by udf_write().
 */
int32_t ud_WRITES = 1;		/* non-zero enables throttling in udf_write */
int32_t ud_HW = 96 * 1024;	/* writers wait while i_writes exceeds this */
/* NOTE(review): ud_LW is not referenced in the code visible here -- */
/* presumably the matching low-water mark; confirm against its user. */
int32_t ud_LW = 64 * 1024;
int32_t ud_throttles = 0;	/* count of waits taken in udf_write */
    333 
    334 /* ARGSUSED */
    335 static int32_t
    336 udf_write(
    337 	struct vnode *vp,
    338 	struct uio *uiop,
    339 	int32_t ioflag,
    340 	struct cred *cr,
    341 	caller_context_t *ct)
    342 {
    343 	struct ud_inode *ip = VTOI(vp);
    344 	int32_t error = 0;
    345 
    346 	ud_printf("udf_write\n");
    347 
    348 #ifdef	__lock_lint
    349 	rw_enter(&ip->i_rwlock, RW_WRITER);
    350 #endif
    351 
    352 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
    353 
    354 	if (MANDLOCK(vp, ip->i_char)) {
    355 		/*
    356 		 * ud_getattr ends up being called by chklock
    357 		 */
    358 		error = chklock(vp, FWRITE, uiop->uio_loffset,
    359 		    uiop->uio_resid, uiop->uio_fmode, ct);
    360 		if (error) {
    361 			goto end;
    362 		}
    363 	}
    364 	/*
    365 	 * Throttle writes.
    366 	 */
    367 	mutex_enter(&ip->i_tlock);
    368 	if (ud_WRITES && (ip->i_writes > ud_HW)) {
    369 		while (ip->i_writes > ud_HW) {
    370 			ud_throttles++;
    371 			cv_wait(&ip->i_wrcv, &ip->i_tlock);
    372 		}
    373 	}
    374 	mutex_exit(&ip->i_tlock);
    375 
    376 	/*
    377 	 * Write to the file
    378 	 */
    379 	rw_enter(&ip->i_contents, RW_WRITER);
    380 	if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
    381 		/*
    382 		 * In append mode start at end of file.
    383 		 */
    384 		uiop->uio_loffset = ip->i_size;
    385 	}
    386 	error = ud_wrip(ip, uiop, ioflag, cr);
    387 	rw_exit(&ip->i_contents);
    388 
    389 end:
    390 #ifdef	__lock_lint
    391 	rw_exit(&ip->i_rwlock);
    392 #endif
    393 
    394 	return (error);
    395 }
    396 
    397 /* ARGSUSED */
    398 static int32_t
    399 udf_ioctl(
    400 	struct vnode *vp,
    401 	int32_t cmd,
    402 	intptr_t arg,
    403 	int32_t flag,
    404 	struct cred *cr,
    405 	int32_t *rvalp,
    406 	caller_context_t *ct)
    407 {
    408 	return (ENOTTY);
    409 }
    410 
    411 /* ARGSUSED */
    412 static int32_t
    413 udf_getattr(
    414 	struct vnode *vp,
    415 	struct vattr *vap,
    416 	int32_t flags,
    417 	struct cred *cr,
    418 	caller_context_t *ct)
    419 {
    420 	struct ud_inode *ip = VTOI(vp);
    421 
    422 	ud_printf("udf_getattr\n");
    423 
    424 	if (vap->va_mask == AT_SIZE) {
    425 		/*
    426 		 * for performance, if only the size is requested don't bother
    427 		 * with anything else.
    428 		 */
    429 		vap->va_size = ip->i_size;
    430 		return (0);
    431 	}
    432 
    433 	rw_enter(&ip->i_contents, RW_READER);
    434 
    435 	vap->va_type = vp->v_type;
    436 	vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
    437 
    438 	vap->va_uid = ip->i_uid;
    439 	vap->va_gid = ip->i_gid;
    440 	vap->va_fsid = ip->i_dev;
    441 	vap->va_nodeid = ip->i_icb_lbano;
    442 	vap->va_nlink = ip->i_nlink;
    443 	vap->va_size = ip->i_size;
    444 	vap->va_seq = ip->i_seq;
    445 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
    446 		vap->va_rdev = ip->i_rdev;
    447 	} else {
    448 		vap->va_rdev = 0;
    449 	}
    450 
    451 	mutex_enter(&ip->i_tlock);
    452 	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
    453 	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
    454 	vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
    455 	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
    456 	vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
    457 	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
    458 	vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
    459 	mutex_exit(&ip->i_tlock);
    460 
    461 	switch (ip->i_type) {
    462 		case VBLK:
    463 			vap->va_blksize = MAXBSIZE;
    464 			break;
    465 		case VCHR:
    466 			vap->va_blksize = MAXBSIZE;
    467 			break;
    468 		default:
    469 			vap->va_blksize = ip->i_udf->udf_lbsize;
    470 			break;
    471 	}
    472 	vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
    473 
    474 	rw_exit(&ip->i_contents);
    475 
    476 	return (0);
    477 }
    478 
    479 static int
    480 ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
    481 {
    482 	return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr));
    483 }
    484 
    485 /*ARGSUSED4*/
    486 static int32_t
    487 udf_setattr(
    488 	struct vnode *vp,
    489 	struct vattr *vap,
    490 	int32_t flags,
    491 	struct cred *cr,
    492 	caller_context_t *ct)
    493 {
    494 	int32_t error = 0;
    495 	uint32_t mask = vap->va_mask;
    496 	struct ud_inode *ip;
    497 	timestruc_t now;
    498 	struct vattr ovap;
    499 
    500 	ud_printf("udf_setattr\n");
    501 
    502 	ip = VTOI(vp);
    503 
    504 	/*
    505 	 * not updates allowed to 4096 files
    506 	 */
    507 	if (ip->i_astrat == STRAT_TYPE4096) {
    508 		return (EINVAL);
    509 	}
    510 
    511 	/*
    512 	 * Cannot set these attributes
    513 	 */
    514 	if (mask & AT_NOSET) {
    515 		return (EINVAL);
    516 	}
    517 
    518 	rw_enter(&ip->i_rwlock, RW_WRITER);
    519 	rw_enter(&ip->i_contents, RW_WRITER);
    520 
    521 	ovap.va_uid = ip->i_uid;
    522 	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
    523 	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
    524 	    ud_iaccess_vmode, ip);
    525 	if (error)
    526 		goto update_inode;
    527 
    528 	mask = vap->va_mask;
    529 	/*
    530 	 * Change file access modes.
    531 	 */
    532 	if (mask & AT_MODE) {
    533 		ip->i_perm = VA2UD_PERM(vap->va_mode);
    534 		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
    535 		mutex_enter(&ip->i_tlock);
    536 		ip->i_flag |= ICHG;
    537 		mutex_exit(&ip->i_tlock);
    538 	}
    539 	if (mask & (AT_UID|AT_GID)) {
    540 		if (mask & AT_UID) {
    541 			ip->i_uid = vap->va_uid;
    542 		}
    543 		if (mask & AT_GID) {
    544 			ip->i_gid = vap->va_gid;
    545 		}
    546 		mutex_enter(&ip->i_tlock);
    547 		ip->i_flag |= ICHG;
    548 		mutex_exit(&ip->i_tlock);
    549 	}
    550 	/*
    551 	 * Truncate file.  Must have write permission and not be a directory.
    552 	 */
    553 	if (mask & AT_SIZE) {
    554 		if (vp->v_type == VDIR) {
    555 			error = EISDIR;
    556 			goto update_inode;
    557 		}
    558 		if (error = ud_iaccess(ip, IWRITE, cr)) {
    559 			goto update_inode;
    560 		}
    561 		if (vap->va_size > MAXOFFSET_T) {
    562 			error = EFBIG;
    563 			goto update_inode;
    564 		}
    565 		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
    566 			goto update_inode;
    567 		}
    568 	}
    569 	/*
    570 	 * Change file access or modified times.
    571 	 */
    572 	if (mask & (AT_ATIME|AT_MTIME)) {
    573 		mutex_enter(&ip->i_tlock);
    574 		if (mask & AT_ATIME) {
    575 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
    576 			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
    577 			ip->i_flag &= ~IACC;
    578 		}
    579 		if (mask & AT_MTIME) {
    580 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
    581 			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
    582 			gethrestime(&now);
    583 			ip->i_ctime.tv_sec = now.tv_sec;
    584 			ip->i_ctime.tv_nsec = now.tv_nsec;
    585 			ip->i_flag &= ~(IUPD|ICHG);
    586 			ip->i_flag |= IMODTIME;
    587 		}
    588 		ip->i_flag |= IMOD;
    589 		mutex_exit(&ip->i_tlock);
    590 	}
    591 
    592 update_inode:
    593 	if (curthread->t_flag & T_DONTPEND) {
    594 		ud_iupdat(ip, 1);
    595 	} else {
    596 		ITIMES_NOLOCK(ip);
    597 	}
    598 	rw_exit(&ip->i_contents);
    599 	rw_exit(&ip->i_rwlock);
    600 
    601 	return (error);
    602 }
    603 
    604 /* ARGSUSED */
    605 static int32_t
    606 udf_access(
    607 	struct vnode *vp,
    608 	int32_t mode,
    609 	int32_t flags,
    610 	struct cred *cr,
    611 	caller_context_t *ct)
    612 {
    613 	struct ud_inode *ip = VTOI(vp);
    614 	int32_t error;
    615 
    616 	ud_printf("udf_access\n");
    617 
    618 	if (ip->i_udf == NULL) {
    619 		return (EIO);
    620 	}
    621 
    622 	error = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr);
    623 
    624 	return (error);
    625 }
    626 
    627 int32_t udfs_stickyhack = 1;
    628 
    629 /* ARGSUSED */
    630 static int32_t
    631 udf_lookup(
    632 	struct vnode *dvp,
    633 	char *nm,
    634 	struct vnode **vpp,
    635 	struct pathname *pnp,
    636 	int32_t flags,
    637 	struct vnode *rdir,
    638 	struct cred *cr,
    639 	caller_context_t *ct,
    640 	int *direntflags,
    641 	pathname_t *realpnp)
    642 {
    643 	int32_t error;
    644 	struct vnode *vp;
    645 	struct ud_inode *ip, *xip;
    646 
    647 	ud_printf("udf_lookup\n");
    648 	/*
    649 	 * Null component name is a synonym for directory being searched.
    650 	 */
    651 	if (*nm == '\0') {
    652 		VN_HOLD(dvp);
    653 		*vpp = dvp;
    654 		error = 0;
    655 		goto out;
    656 	}
    657 
    658 	/*
    659 	 * Fast path: Check the directory name lookup cache.
    660 	 */
    661 	ip = VTOI(dvp);
    662 	if (vp = dnlc_lookup(dvp, nm)) {
    663 		/*
    664 		 * Check accessibility of directory.
    665 		 */
    666 		if ((error = ud_iaccess(ip, IEXEC, cr)) != 0) {
    667 			VN_RELE(vp);
    668 		}
    669 		xip = VTOI(vp);
    670 	} else {
    671 		error = ud_dirlook(ip, nm, &xip, cr, 1);
    672 		ITIMES(ip);
    673 	}
    674 
    675 	if (error == 0) {
    676 		ip = xip;
    677 		*vpp = ITOV(ip);
    678 		if ((ip->i_type != VDIR) &&
    679 		    (ip->i_char & ISVTX) &&
    680 		    ((ip->i_perm & IEXEC) == 0) &&
    681 		    udfs_stickyhack) {
    682 			mutex_enter(&(*vpp)->v_lock);
    683 			(*vpp)->v_flag |= VISSWAP;
    684 			mutex_exit(&(*vpp)->v_lock);
    685 		}
    686 		ITIMES(ip);
    687 		/*
    688 		 * If vnode is a device return special vnode instead.
    689 		 */
    690 		if (IS_DEVVP(*vpp)) {
    691 			struct vnode *newvp;
    692 			newvp = specvp(*vpp, (*vpp)->v_rdev,
    693 			    (*vpp)->v_type, cr);
    694 			VN_RELE(*vpp);
    695 			if (newvp == NULL) {
    696 				error = ENOSYS;
    697 			} else {
    698 				*vpp = newvp;
    699 			}
    700 		}
    701 	}
    702 out:
    703 	return (error);
    704 }
    705 
    706 /* ARGSUSED */
    707 static int32_t
    708 udf_create(
    709 	struct vnode *dvp,
    710 	char *name,
    711 	struct vattr *vap,
    712 	enum vcexcl excl,
    713 	int32_t mode,
    714 	struct vnode **vpp,
    715 	struct cred *cr,
    716 	int32_t flag,
    717 	caller_context_t *ct,
    718 	vsecattr_t *vsecp)
    719 {
    720 	int32_t error;
    721 	struct ud_inode *ip = VTOI(dvp), *xip;
    722 
    723 	ud_printf("udf_create\n");
    724 
    725 	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
    726 		vap->va_mode &= ~VSVTX;
    727 
    728 	if (*name == '\0') {
    729 		/*
    730 		 * Null component name refers to the directory itself.
    731 		 */
    732 		VN_HOLD(dvp);
    733 		ITIMES(ip);
    734 		error = EEXIST;
    735 	} else {
    736 		xip = NULL;
    737 		rw_enter(&ip->i_rwlock, RW_WRITER);
    738 		error = ud_direnter(ip, name, DE_CREATE,
    739 		    (struct ud_inode *)0, (struct ud_inode *)0,
    740 		    vap, &xip, cr, ct);
    741 		rw_exit(&ip->i_rwlock);
    742 		ITIMES(ip);
    743 		ip = xip;
    744 	}
    745 #ifdef	__lock_lint
    746 	rw_enter(&ip->i_contents, RW_WRITER);
    747 #else
    748 	if (ip != NULL) {
    749 		rw_enter(&ip->i_contents, RW_WRITER);
    750 	}
    751 #endif
    752 
    753 	/*
    754 	 * If the file already exists and this is a non-exclusive create,
    755 	 * check permissions and allow access for non-directories.
    756 	 * Read-only create of an existing directory is also allowed.
    757 	 * We fail an exclusive create of anything which already exists.
    758 	 */
    759 	if (error == EEXIST) {
    760 		if (excl == NONEXCL) {
    761 			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
    762 				error = EISDIR;
    763 			} else if (mode) {
    764 				error = ud_iaccess(ip,
    765 				    UD_UPERM2DPERM(mode), cr);
    766 			} else {
    767 				error = 0;
    768 			}
    769 		}
    770 		if (error) {
    771 			rw_exit(&ip->i_contents);
    772 			VN_RELE(ITOV(ip));
    773 			goto out;
    774 		} else if ((ip->i_type == VREG) &&
    775 		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
    776 			/*
    777 			 * Truncate regular files, if requested by caller.
    778 			 * Grab i_rwlock to make sure no one else is
    779 			 * currently writing to the file (we promised
    780 			 * bmap we would do this).
    781 			 * Must get the locks in the correct order.
    782 			 */
    783 			if (ip->i_size == 0) {
    784 				ip->i_flag |= ICHG | IUPD;
    785 			} else {
    786 				rw_exit(&ip->i_contents);
    787 				rw_enter(&ip->i_rwlock, RW_WRITER);
    788 				rw_enter(&ip->i_contents, RW_WRITER);
    789 				(void) ud_itrunc(ip, 0, 0, cr);
    790 				rw_exit(&ip->i_rwlock);
    791 			}
    792 			vnevent_create(ITOV(ip), ct);
    793 		}
    794 	}
    795 
    796 	if (error == 0) {
    797 		*vpp = ITOV(ip);
    798 		ITIMES(ip);
    799 	}
    800 #ifdef	__lock_lint
    801 	rw_exit(&ip->i_contents);
    802 #else
    803 	if (ip != NULL) {
    804 		rw_exit(&ip->i_contents);
    805 	}
    806 #endif
    807 	if (error) {
    808 		goto out;
    809 	}
    810 
    811 	/*
    812 	 * If vnode is a device return special vnode instead.
    813 	 */
    814 	if (!error && IS_DEVVP(*vpp)) {
    815 		struct vnode *newvp;
    816 
    817 		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
    818 		VN_RELE(*vpp);
    819 		if (newvp == NULL) {
    820 			error = ENOSYS;
    821 			goto out;
    822 		}
    823 		*vpp = newvp;
    824 	}
    825 out:
    826 	return (error);
    827 }
    828 
    829 /* ARGSUSED */
    830 static int32_t
    831 udf_remove(
    832 	struct vnode *vp,
    833 	char *nm,
    834 	struct cred *cr,
    835 	caller_context_t *ct,
    836 	int flags)
    837 {
    838 	int32_t error;
    839 	struct ud_inode *ip = VTOI(vp);
    840 
    841 	ud_printf("udf_remove\n");
    842 
    843 	rw_enter(&ip->i_rwlock, RW_WRITER);
    844 	error = ud_dirremove(ip, nm,
    845 	    (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
    846 	rw_exit(&ip->i_rwlock);
    847 	ITIMES(ip);
    848 
    849 	return (error);
    850 }
    851 
    852 /* ARGSUSED */
    853 static int32_t
    854 udf_link(
    855 	struct vnode *tdvp,
    856 	struct vnode *svp,
    857 	char *tnm,
    858 	struct cred *cr,
    859 	caller_context_t *ct,
    860 	int flags)
    861 {
    862 	int32_t error;
    863 	struct vnode *realvp;
    864 	struct ud_inode *sip;
    865 	struct ud_inode *tdp;
    866 
    867 	ud_printf("udf_link\n");
    868 	if (VOP_REALVP(svp, &realvp, ct) == 0) {
    869 		svp = realvp;
    870 	}
    871 
    872 	/*
    873 	 * Do not allow links to directories
    874 	 */
    875 	if (svp->v_type == VDIR) {
    876 		return (EPERM);
    877 	}
    878 
    879 	sip = VTOI(svp);
    880 
    881 	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
    882 		return (EPERM);
    883 
    884 	tdp = VTOI(tdvp);
    885 
    886 	rw_enter(&tdp->i_rwlock, RW_WRITER);
    887 	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
    888 	    sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
    889 	rw_exit(&tdp->i_rwlock);
    890 	ITIMES(sip);
    891 	ITIMES(tdp);
    892 
    893 	if (error == 0) {
    894 		vnevent_link(svp, ct);
    895 	}
    896 
    897 	return (error);
    898 }
    899 
    900 /* ARGSUSED */
    901 static int32_t
    902 udf_rename(
    903 	struct vnode *sdvp,
    904 	char *snm,
    905 	struct vnode *tdvp,
    906 	char *tnm,
    907 	struct cred *cr,
    908 	caller_context_t *ct,
    909 	int flags)
    910 {
    911 	int32_t error = 0;
    912 	struct udf_vfs *udf_vfsp;
    913 	struct ud_inode *sip;		/* source inode */
    914 	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
    915 	struct vnode *realvp;
    916 
    917 	ud_printf("udf_rename\n");
    918 
    919 	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
    920 		tdvp = realvp;
    921 	}
    922 
    923 	sdp = VTOI(sdvp);
    924 	tdp = VTOI(tdvp);
    925 
    926 	udf_vfsp = sdp->i_udf;
    927 
    928 	mutex_enter(&udf_vfsp->udf_rename_lck);
    929 	/*
    930 	 * Look up inode of file we're supposed to rename.
    931 	 */
    932 	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
    933 		mutex_exit(&udf_vfsp->udf_rename_lck);
    934 		return (error);
    935 	}
    936 	/*
    937 	 * be sure this is not a directory with another file system mounted
    938 	 * over it.  If it is just give up the locks, and return with
    939 	 * EBUSY
    940 	 */
    941 	if (vn_mountedvfs(ITOV(sip)) != NULL) {
    942 		error = EBUSY;
    943 		goto errout;
    944 	}
    945 	/*
    946 	 * Make sure we can delete the source entry.  This requires
    947 	 * write permission on the containing directory.  If that
    948 	 * directory is "sticky" it further requires (except for
    949 	 * privileged users) that the user own the directory or the
    950 	 * source entry, or else have permission to write the source
    951 	 * entry.
    952 	 */
    953 	rw_enter(&sdp->i_contents, RW_READER);
    954 	rw_enter(&sip->i_contents, RW_READER);
    955 	if ((error = ud_iaccess(sdp, IWRITE, cr)) != 0 ||
    956 	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
    957 		rw_exit(&sip->i_contents);
    958 		rw_exit(&sdp->i_contents);
    959 		ITIMES(sip);
    960 		goto errout;
    961 	}
    962 
    963 	/*
    964 	 * Check for renaming '.' or '..' or alias of '.'
    965 	 */
    966 	if ((strcmp(snm, ".") == 0) ||
    967 	    (strcmp(snm, "..") == 0) ||
    968 	    (sdp == sip)) {
    969 		error = EINVAL;
    970 		rw_exit(&sip->i_contents);
    971 		rw_exit(&sdp->i_contents);
    972 		goto errout;
    973 	}
    974 	rw_exit(&sip->i_contents);
    975 	rw_exit(&sdp->i_contents);
    976 
    977 
    978 	/*
    979 	 * Link source to the target.
    980 	 */
    981 	rw_enter(&tdp->i_rwlock, RW_WRITER);
    982 	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
    983 	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
    984 		/*
    985 		 * ESAME isn't really an error; it indicates that the
    986 		 * operation should not be done because the source and target
    987 		 * are the same file, but that no error should be reported.
    988 		 */
    989 		if (error == ESAME) {
    990 			error = 0;
    991 		}
    992 		rw_exit(&tdp->i_rwlock);
    993 		goto errout;
    994 	}
    995 	vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
    996 	rw_exit(&tdp->i_rwlock);
    997 
    998 	rw_enter(&sdp->i_rwlock, RW_WRITER);
    999 	/*
   1000 	 * Unlink the source.
   1001 	 * Remove the source entry.  ud_dirremove() checks that the entry
   1002 	 * still reflects sip, and returns an error if it doesn't.
   1003 	 * If the entry has changed just forget about it.  Release
   1004 	 * the source inode.
   1005 	 */
   1006 	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
   1007 	    DR_RENAME, cr, ct)) == ENOENT) {
   1008 		error = 0;
   1009 	}
   1010 	rw_exit(&sdp->i_rwlock);
   1011 errout:
   1012 	ITIMES(sdp);
   1013 	ITIMES(tdp);
   1014 	VN_RELE(ITOV(sip));
   1015 	mutex_exit(&udf_vfsp->udf_rename_lck);
   1016 
   1017 	return (error);
   1018 }
   1019 
   1020 /* ARGSUSED */
   1021 static int32_t
   1022 udf_mkdir(
   1023 	struct vnode *dvp,
   1024 	char *dirname,
   1025 	struct vattr *vap,
   1026 	struct vnode **vpp,
   1027 	struct cred *cr,
   1028 	caller_context_t *ct,
   1029 	int flags,
   1030 	vsecattr_t *vsecp)
   1031 {
   1032 	int32_t error;
   1033 	struct ud_inode *ip;
   1034 	struct ud_inode *xip;
   1035 
   1036 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
   1037 
   1038 	ud_printf("udf_mkdir\n");
   1039 
   1040 	ip = VTOI(dvp);
   1041 	rw_enter(&ip->i_rwlock, RW_WRITER);
   1042 	error = ud_direnter(ip, dirname, DE_MKDIR,
   1043 	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
   1044 	rw_exit(&ip->i_rwlock);
   1045 	ITIMES(ip);
   1046 	if (error == 0) {
   1047 		ip = xip;
   1048 		*vpp = ITOV(ip);
   1049 		ITIMES(ip);
   1050 	} else if (error == EEXIST) {
   1051 		ITIMES(xip);
   1052 		VN_RELE(ITOV(xip));
   1053 	}
   1054 
   1055 	return (error);
   1056 }
   1057 
   1058 /* ARGSUSED */
   1059 static int32_t
   1060 udf_rmdir(
   1061 	struct vnode *vp,
   1062 	char *nm,
   1063 	struct vnode *cdir,
   1064 	struct cred *cr,
   1065 	caller_context_t *ct,
   1066 	int flags)
   1067 {
   1068 	int32_t error;
   1069 	struct ud_inode *ip = VTOI(vp);
   1070 
   1071 	ud_printf("udf_rmdir\n");
   1072 
   1073 	rw_enter(&ip->i_rwlock, RW_WRITER);
   1074 	error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
   1075 	    cr, ct);
   1076 	rw_exit(&ip->i_rwlock);
   1077 	ITIMES(ip);
   1078 
   1079 	return (error);
   1080 }
   1081 
   1082 /* ARGSUSED */
   1083 static int32_t
   1084 udf_readdir(
   1085 	struct vnode *vp,
   1086 	struct uio *uiop,
   1087 	struct cred *cr,
   1088 	int32_t *eofp,
   1089 	caller_context_t *ct,
   1090 	int flags)
   1091 {
   1092 	struct ud_inode *ip;
   1093 	struct dirent64 *nd;
   1094 	struct udf_vfs *udf_vfsp;
   1095 	int32_t error = 0, len, outcount = 0;
   1096 	uint32_t dirsiz, offset;
   1097 	uint32_t bufsize, ndlen, dummy;
   1098 	caddr_t outbuf;
   1099 	caddr_t outb, end_outb;
   1100 	struct iovec *iovp;
   1101 
   1102 	uint8_t *dname;
   1103 	int32_t length;
   1104 
   1105 	uint8_t *buf = NULL;
   1106 
   1107 	struct fbuf *fbp = NULL;
   1108 	struct file_id *fid;
   1109 	uint8_t *name;
   1110 
   1111 
   1112 	ud_printf("udf_readdir\n");
   1113 
   1114 	ip = VTOI(vp);
   1115 	udf_vfsp = ip->i_udf;
   1116 
   1117 	dirsiz = ip->i_size;
   1118 	if ((uiop->uio_offset >= dirsiz) ||
   1119 	    (ip->i_nlink <= 0)) {
   1120 		if (eofp) {
   1121 			*eofp = 1;
   1122 		}
   1123 		return (0);
   1124 	}
   1125 
   1126 	offset = uiop->uio_offset;
   1127 	iovp = uiop->uio_iov;
   1128 	bufsize = iovp->iov_len;
   1129 
   1130 	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
   1131 	end_outb = outb + bufsize;
   1132 	nd = (struct dirent64 *)outbuf;
   1133 
   1134 	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
   1135 	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
   1136 
   1137 	if (offset == 0) {
   1138 		len = DIRENT64_RECLEN(1);
   1139 		if (((caddr_t)nd + len) >= end_outb) {
   1140 			error = EINVAL;
   1141 			goto end;
   1142 		}
   1143 		nd->d_ino = ip->i_icb_lbano;
   1144 		nd->d_reclen = (uint16_t)len;
   1145 		nd->d_off = 0x10;
   1146 		nd->d_name[0] = '.';
   1147 		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
   1148 		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
   1149 		outcount++;
   1150 	} else if (offset == 0x10) {
   1151 		offset = 0;
   1152 	}
   1153 
   1154 	while (offset < dirsiz) {
   1155 		error = ud_get_next_fid(ip, &fbp,
   1156 		    offset, &fid, &name, buf);
   1157 		if (error != 0) {
   1158 			break;
   1159 		}
   1160 
   1161 		if ((fid->fid_flags & FID_DELETED) == 0) {
   1162 			if (fid->fid_flags & FID_PARENT) {
   1163 
   1164 				len = DIRENT64_RECLEN(2);
   1165 				if (((caddr_t)nd + len) >= end_outb) {
   1166 					error = EINVAL;
   1167 					break;
   1168 				}
   1169 
   1170 				nd->d_ino = ip->i_icb_lbano;
   1171 				nd->d_reclen = (uint16_t)len;
   1172 				nd->d_off = offset + FID_LEN(fid);
   1173 				nd->d_name[0] = '.';
   1174 				nd->d_name[1] = '.';
   1175 				bzero(&nd->d_name[2],
   1176 				    DIRENT64_NAMELEN(len) - 2);
   1177 				nd = (struct dirent64 *)
   1178 				    ((char *)nd + nd->d_reclen);
   1179 			} else {
   1180 				if ((error = ud_uncompress(fid->fid_idlen,
   1181 				    &length, name, dname)) != 0) {
   1182 					break;
   1183 				}
   1184 				if (length == 0) {
   1185 					offset += FID_LEN(fid);
   1186 					continue;
   1187 				}
   1188 				len = DIRENT64_RECLEN(length);
   1189 				if (((caddr_t)nd + len) >= end_outb) {
   1190 					if (!outcount) {
   1191 						error = EINVAL;
   1192 					}
   1193 					break;
   1194 				}
   1195 				(void) strncpy(nd->d_name,
   1196 				    (caddr_t)dname, length);
   1197 				bzero(&nd->d_name[length],
   1198 				    DIRENT64_NAMELEN(len) - length);
   1199 				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
   1200 				    SWAP_16(fid->fid_icb.lad_ext_prn),
   1201 				    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
   1202 				    &dummy);
   1203 				nd->d_reclen = (uint16_t)len;
   1204 				nd->d_off = offset + FID_LEN(fid);
   1205 				nd = (struct dirent64 *)
   1206 				    ((char *)nd + nd->d_reclen);
   1207 			}
   1208 			outcount++;
   1209 		}
   1210 
   1211 		offset += FID_LEN(fid);
   1212 	}
   1213 
   1214 end:
   1215 	if (fbp != NULL) {
   1216 		fbrelse(fbp, S_OTHER);
   1217 	}
   1218 	ndlen = ((char *)nd - outbuf);
   1219 	/*
   1220 	 * In case of error do not call uiomove.
   1221 	 * Return the error to the caller.
   1222 	 */
   1223 	if ((error == 0) && (ndlen != 0)) {
   1224 		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
   1225 		uiop->uio_offset = offset;
   1226 	}
   1227 	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
   1228 	kmem_free((caddr_t)dname, 1024);
   1229 	kmem_free(outbuf, (uint32_t)bufsize);
   1230 	if (eofp && error == 0) {
   1231 		*eofp = (uiop->uio_offset >= dirsiz);
   1232 	}
   1233 	return (error);
   1234 }
   1235 
   1236 /* ARGSUSED */
   1237 static int32_t
   1238 udf_symlink(
   1239 	struct vnode *dvp,
   1240 	char *linkname,
   1241 	struct vattr *vap,
   1242 	char *target,
   1243 	struct cred *cr,
   1244 	caller_context_t *ct,
   1245 	int flags)
   1246 {
   1247 	int32_t error = 0, outlen;
   1248 	uint32_t ioflag = 0;
   1249 	struct ud_inode *ip, *dip = VTOI(dvp);
   1250 
   1251 	struct path_comp *pc;
   1252 	int8_t *dname = NULL, *uname = NULL, *sp;
   1253 
   1254 	ud_printf("udf_symlink\n");
   1255 
   1256 	ip = (struct ud_inode *)0;
   1257 	vap->va_type = VLNK;
   1258 	vap->va_rdev = 0;
   1259 
   1260 	rw_enter(&dip->i_rwlock, RW_WRITER);
   1261 	error = ud_direnter(dip, linkname, DE_CREATE,
   1262 	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
   1263 	rw_exit(&dip->i_rwlock);
   1264 	if (error == 0) {
   1265 		dname = kmem_zalloc(1024, KM_SLEEP);
   1266 		uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
   1267 
   1268 		pc = (struct path_comp *)uname;
   1269 		/*
   1270 		 * If the first character in target is "/"
   1271 		 * then skip it and create entry for it
   1272 		 */
   1273 		if (*target == '/') {
   1274 			pc->pc_type = 2;
   1275 			pc->pc_len = 0;
   1276 			pc = (struct path_comp *)(((char *)pc) + 4);
   1277 			while (*target == '/') {
   1278 				target++;
   1279 			}
   1280 		}
   1281 
   1282 		while (*target != NULL) {
   1283 			sp = target;
   1284 			while ((*target != '/') && (*target != '\0')) {
   1285 				target ++;
   1286 			}
   1287 			/*
   1288 			 * We got the next component of the
   1289 			 * path name. Create path_comp of
   1290 			 * appropriate type
   1291 			 */
   1292 			if (((target - sp) == 1) && (*sp == '.')) {
   1293 				/*
   1294 				 * Dot entry.
   1295 				 */
   1296 				pc->pc_type = 4;
   1297 				pc = (struct path_comp *)(((char *)pc) + 4);
   1298 			} else if (((target - sp) == 2) &&
   1299 			    (*sp == '.') && ((*(sp + 1)) == '.')) {
   1300 				/*
   1301 				 * DotDot entry.
   1302 				 */
   1303 				pc->pc_type = 3;
   1304 				pc = (struct path_comp *)(((char *)pc) + 4);
   1305 			} else {
   1306 				/*
   1307 				 * convert the user given name
   1308 				 * into appropriate form to be put
   1309 				 * on the media
   1310 				 */
   1311 				outlen = 1024;	/* set to size of dname */
   1312 				if (error = ud_compress(target - sp, &outlen,
   1313 				    (uint8_t *)sp, (uint8_t *)dname)) {
   1314 					break;
   1315 				}
   1316 				pc->pc_type = 5;
   1317 				/* LINTED */
   1318 				pc->pc_len = outlen;
   1319 				dname[outlen] = '\0';
   1320 				(void) strcpy((char *)pc->pc_id, dname);
   1321 				pc = (struct path_comp *)
   1322 				    (((char *)pc) + 4 + outlen);
   1323 			}
   1324 			while (*target == '/') {
   1325 				target++;
   1326 			}
   1327 			if (*target == NULL) {
   1328 				break;
   1329 			}
   1330 		}
   1331 
   1332 		rw_enter(&ip->i_contents, RW_WRITER);
   1333 		if (error == 0) {
   1334 			ioflag = FWRITE;
   1335 			if (curthread->t_flag & T_DONTPEND) {
   1336 				ioflag |= FDSYNC;
   1337 			}
   1338 			error = ud_rdwri(UIO_WRITE, ioflag, ip,
   1339 			    uname, ((int8_t *)pc) - uname,
   1340 			    (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
   1341 		}
   1342 		if (error) {
   1343 			ud_idrop(ip);
   1344 			rw_exit(&ip->i_contents);
   1345 			rw_enter(&dip->i_rwlock, RW_WRITER);
   1346 			(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
   1347 			    (struct vnode *)0, DR_REMOVE, cr, ct);
   1348 			rw_exit(&dip->i_rwlock);
   1349 			goto update_inode;
   1350 		}
   1351 		rw_exit(&ip->i_contents);
   1352 	}
   1353 
   1354 	if ((error == 0) || (error == EEXIST)) {
   1355 		VN_RELE(ITOV(ip));
   1356 	}
   1357 
   1358 update_inode:
   1359 	ITIMES(VTOI(dvp));
   1360 	if (uname != NULL) {
   1361 		kmem_free(uname, PAGESIZE);
   1362 	}
   1363 	if (dname != NULL) {
   1364 		kmem_free(dname, 1024);
   1365 	}
   1366 
   1367 	return (error);
   1368 }
   1369 
   1370 /* ARGSUSED */
   1371 static int32_t
   1372 udf_readlink(
   1373 	struct vnode *vp,
   1374 	struct uio *uiop,
   1375 	struct cred *cr,
   1376 	caller_context_t *ct)
   1377 {
   1378 	int32_t error = 0, off, id_len, size, len;
   1379 	int8_t *dname = NULL, *uname = NULL;
   1380 	struct ud_inode *ip;
   1381 	struct fbuf *fbp = NULL;
   1382 	struct path_comp *pc;
   1383 
   1384 	ud_printf("udf_readlink\n");
   1385 
   1386 	if (vp->v_type != VLNK) {
   1387 		return (EINVAL);
   1388 	}
   1389 
   1390 	ip = VTOI(vp);
   1391 	size = ip->i_size;
   1392 	if (size > PAGESIZE) {
   1393 		return (EIO);
   1394 	}
   1395 
   1396 	if (size == 0) {
   1397 		return (0);
   1398 	}
   1399 
   1400 	dname = kmem_zalloc(1024, KM_SLEEP);
   1401 	uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
   1402 
   1403 	rw_enter(&ip->i_contents, RW_READER);
   1404 
   1405 	if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
   1406 		goto end;
   1407 	}
   1408 
   1409 	off = 0;
   1410 
   1411 	while (off < size) {
   1412 		pc = (struct path_comp *)(fbp->fb_addr + off);
   1413 		switch (pc->pc_type) {
   1414 			case 1 :
   1415 				(void) strcpy(uname, ip->i_udf->udf_fsmnt);
   1416 				(void) strcat(uname, "/");
   1417 				break;
   1418 			case 2 :
   1419 				if (pc->pc_len != 0) {
   1420 					goto end;
   1421 				}
   1422 				uname[0] = '/';
   1423 				uname[1] = '\0';
   1424 				break;
   1425 			case 3 :
   1426 				(void) strcat(uname, "../");
   1427 				break;
   1428 			case 4 :
   1429 				(void) strcat(uname, "./");
   1430 				break;
   1431 			case 5 :
   1432 				if ((error = ud_uncompress(pc->pc_len, &id_len,
   1433 				    pc->pc_id, (uint8_t *)dname)) != 0) {
   1434 					break;
   1435 				}
   1436 				dname[id_len] = '\0';
   1437 				(void) strcat(uname, dname);
   1438 				(void) strcat(uname, "/");
   1439 				break;
   1440 			default :
   1441 				error = EINVAL;
   1442 				goto end;
   1443 		}
   1444 		off += 4 + pc->pc_len;
   1445 	}
   1446 	len = strlen(uname) - 1;
   1447 	if (uname[len] == '/') {
   1448 		if (len == 0) {
   1449 			/*
   1450 			 * special case link to /
   1451 			 */
   1452 			len = 1;
   1453 		} else {
   1454 			uname[len] = '\0';
   1455 		}
   1456 	}
   1457 
   1458 	error = uiomove(uname, len, UIO_READ, uiop);
   1459 
   1460 	ITIMES(ip);
   1461 
   1462 end:
   1463 	if (fbp != NULL) {
   1464 		fbrelse(fbp, S_OTHER);
   1465 	}
   1466 	rw_exit(&ip->i_contents);
   1467 	if (uname != NULL) {
   1468 		kmem_free(uname, PAGESIZE);
   1469 	}
   1470 	if (dname != NULL) {
   1471 		kmem_free(dname, 1024);
   1472 	}
   1473 	return (error);
   1474 }
   1475 
   1476 /* ARGSUSED */
   1477 static int32_t
   1478 udf_fsync(
   1479 	struct vnode *vp,
   1480 	int32_t syncflag,
   1481 	struct cred *cr,
   1482 	caller_context_t *ct)
   1483 {
   1484 	int32_t error = 0;
   1485 	struct ud_inode *ip = VTOI(vp);
   1486 
   1487 	ud_printf("udf_fsync\n");
   1488 
   1489 	rw_enter(&ip->i_contents, RW_WRITER);
   1490 	if (!(IS_SWAPVP(vp))) {
   1491 		error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
   1492 	}
   1493 	if (error == 0) {
   1494 		error = ud_sync_indir(ip);
   1495 	}
   1496 	ITIMES(ip);		/* XXX: is this necessary ??? */
   1497 	rw_exit(&ip->i_contents);
   1498 
   1499 	return (error);
   1500 }
   1501 
   1502 /* ARGSUSED */
   1503 static void
   1504 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
   1505 {
   1506 	ud_printf("udf_iinactive\n");
   1507 
   1508 	ud_iinactive(VTOI(vp), cr);
   1509 }
   1510 
   1511 /* ARGSUSED */
   1512 static int32_t
   1513 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
   1514 {
   1515 	struct udf_fid *udfidp;
   1516 	struct ud_inode *ip = VTOI(vp);
   1517 
   1518 	ud_printf("udf_fid\n");
   1519 
   1520 	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
   1521 		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
   1522 		return (ENOSPC);
   1523 	}
   1524 
   1525 	udfidp = (struct udf_fid *)fidp;
   1526 	bzero((char *)udfidp, sizeof (struct udf_fid));
   1527 	rw_enter(&ip->i_contents, RW_READER);
   1528 	udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
   1529 	udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
   1530 	udfidp->udfid_prn = ip->i_icb_prn;
   1531 	udfidp->udfid_icb_lbn = ip->i_icb_block;
   1532 	rw_exit(&ip->i_contents);
   1533 
   1534 	return (0);
   1535 }
   1536 
   1537 /* ARGSUSED2 */
   1538 static int
   1539 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
   1540 {
   1541 	struct ud_inode *ip = VTOI(vp);
   1542 
   1543 	ud_printf("udf_rwlock\n");
   1544 
   1545 	if (write_lock) {
   1546 		rw_enter(&ip->i_rwlock, RW_WRITER);
   1547 	} else {
   1548 		rw_enter(&ip->i_rwlock, RW_READER);
   1549 	}
   1550 #ifdef	__lock_lint
   1551 	rw_exit(&ip->i_rwlock);
   1552 #endif
   1553 	return (write_lock);
   1554 }
   1555 
   1556 /* ARGSUSED */
   1557 static void
   1558 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
   1559 {
   1560 	struct ud_inode *ip = VTOI(vp);
   1561 
   1562 	ud_printf("udf_rwunlock\n");
   1563 
   1564 #ifdef	__lock_lint
   1565 	rw_enter(&ip->i_rwlock, RW_WRITER);
   1566 #endif
   1567 
   1568 	rw_exit(&ip->i_rwlock);
   1569 
   1570 }
   1571 
   1572 /* ARGSUSED */
   1573 static int32_t
   1574 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
   1575 {
   1576 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
   1577 }
   1578 
   1579 static int32_t
   1580 udf_frlock(
   1581 	struct vnode *vp,
   1582 	int32_t cmd,
   1583 	struct flock64 *bfp,
   1584 	int32_t flag,
   1585 	offset_t offset,
   1586 	struct flk_callback *flk_cbp,
   1587 	cred_t *cr,
   1588 	caller_context_t *ct)
   1589 {
   1590 	struct ud_inode *ip = VTOI(vp);
   1591 
   1592 	ud_printf("udf_frlock\n");
   1593 
   1594 	/*
   1595 	 * If file is being mapped, disallow frlock.
   1596 	 * XXX I am not holding tlock while checking i_mapcnt because the
   1597 	 * current locking strategy drops all locks before calling fs_frlock.
   1598 	 * So, mapcnt could change before we enter fs_frlock making is
   1599 	 * meaningless to have held tlock in the first place.
   1600 	 */
   1601 	if ((ip->i_mapcnt > 0) &&
   1602 	    (MANDLOCK(vp, ip->i_char))) {
   1603 		return (EAGAIN);
   1604 	}
   1605 
   1606 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
   1607 }
   1608 
   1609 /*ARGSUSED6*/
   1610 static int32_t
   1611 udf_space(
   1612 	struct vnode *vp,
   1613 	int32_t cmd,
   1614 	struct flock64 *bfp,
   1615 	int32_t flag,
   1616 	offset_t offset,
   1617 	cred_t *cr,
   1618 	caller_context_t *ct)
   1619 {
   1620 	int32_t error = 0;
   1621 
   1622 	ud_printf("udf_space\n");
   1623 
   1624 	if (cmd != F_FREESP) {
   1625 		error =  EINVAL;
   1626 	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
   1627 		error = ud_freesp(vp, bfp, flag, cr);
   1628 	}
   1629 
   1630 	return (error);
   1631 }
   1632 
   1633 /* ARGSUSED */
   1634 static int32_t
   1635 udf_getpage(
   1636 	struct vnode *vp,
   1637 	offset_t off,
   1638 	size_t len,
   1639 	uint32_t *protp,
   1640 	struct page **plarr,
   1641 	size_t plsz,
   1642 	struct seg *seg,
   1643 	caddr_t addr,
   1644 	enum seg_rw rw,
   1645 	struct cred *cr,
   1646 	caller_context_t *ct)
   1647 {
   1648 	struct ud_inode *ip = VTOI(vp);
   1649 	int32_t error, has_holes, beyond_eof, seqmode, dolock;
   1650 	int32_t pgsize = PAGESIZE;
   1651 	struct udf_vfs *udf_vfsp = ip->i_udf;
   1652 	page_t **pl;
   1653 	u_offset_t pgoff, eoff, uoff;
   1654 	krw_t rwtype;
   1655 	caddr_t pgaddr;
   1656 
   1657 	ud_printf("udf_getpage\n");
   1658 
   1659 	uoff = (u_offset_t)off; /* type conversion */
   1660 	if (protp) {
   1661 		*protp = PROT_ALL;
   1662 	}
   1663 	if (vp->v_flag & VNOMAP) {
   1664 		return (ENOSYS);
   1665 	}
   1666 	seqmode = ip->i_nextr == uoff && rw != S_CREATE;
   1667 
   1668 	rwtype = RW_READER;
   1669 	dolock = (rw_owner(&ip->i_contents) != curthread);
   1670 retrylock:
   1671 #ifdef	__lock_lint
   1672 	rw_enter(&ip->i_contents, rwtype);
   1673 #else
   1674 	if (dolock) {
   1675 		rw_enter(&ip->i_contents, rwtype);
   1676 	}
   1677 #endif
   1678 
   1679 	/*
   1680 	 * We may be getting called as a side effect of a bmap using
   1681 	 * fbread() when the blocks might be being allocated and the
   1682 	 * size has not yet been up'ed.  In this case we want to be
   1683 	 * able to return zero pages if we get back UDF_HOLE from
   1684 	 * calling bmap for a non write case here.  We also might have
   1685 	 * to read some frags from the disk into a page if we are
   1686 	 * extending the number of frags for a given lbn in bmap().
   1687 	 */
   1688 	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
   1689 	if (beyond_eof && seg != segkmap) {
   1690 #ifdef	__lock_lint
   1691 		rw_exit(&ip->i_contents);
   1692 #else
   1693 		if (dolock) {
   1694 			rw_exit(&ip->i_contents);
   1695 		}
   1696 #endif
   1697 		return (EFAULT);
   1698 	}
   1699 
   1700 	/*
   1701 	 * Must hold i_contents lock throughout the call to pvn_getpages
   1702 	 * since locked pages are returned from each call to ud_getapage.
   1703 	 * Must *not* return locked pages and then try for contents lock
   1704 	 * due to lock ordering requirements (inode > page)
   1705 	 */
   1706 
   1707 	has_holes = ud_bmap_has_holes(ip);
   1708 
   1709 	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
   1710 		int32_t	blk_size, count;
   1711 		u_offset_t offset;
   1712 
   1713 		/*
   1714 		 * We must acquire the RW_WRITER lock in order to
   1715 		 * call bmap_write().
   1716 		 */
   1717 		if (dolock && rwtype == RW_READER) {
   1718 			rwtype = RW_WRITER;
   1719 
   1720 			if (!rw_tryupgrade(&ip->i_contents)) {
   1721 
   1722 				rw_exit(&ip->i_contents);
   1723 
   1724 				goto retrylock;
   1725 			}
   1726 		}
   1727 
   1728 		/*
   1729 		 * May be allocating disk blocks for holes here as
   1730 		 * a result of mmap faults. write(2) does the bmap_write
   1731 		 * in rdip/wrip, not here. We are not dealing with frags
   1732 		 * in this case.
   1733 		 */
   1734 		offset = uoff;
   1735 		while ((offset < uoff + len) &&
   1736 		    (offset < ip->i_size)) {
   1737 			/*
   1738 			 * the variable "bnp" is to simplify the expression for
   1739 			 * the compiler; * just passing in &bn to bmap_write
   1740 			 * causes a compiler "loop"
   1741 			 */
   1742 
   1743 			blk_size = udf_vfsp->udf_lbsize;
   1744 			if ((offset + blk_size) > ip->i_size) {
   1745 				count = ip->i_size - offset;
   1746 			} else {
   1747 				count = blk_size;
   1748 			}
   1749 			error = ud_bmap_write(ip, offset, count, 0, cr);
   1750 			if (error) {
   1751 				goto update_inode;
   1752 			}
   1753 			offset += count; /* XXX - make this contig */
   1754 		}
   1755 	}
   1756 
   1757 	/*
   1758 	 * Can be a reader from now on.
   1759 	 */
   1760 #ifdef	__lock_lint
   1761 	if (rwtype == RW_WRITER) {
   1762 		rw_downgrade(&ip->i_contents);
   1763 	}
   1764 #else
   1765 	if (dolock && rwtype == RW_WRITER) {
   1766 		rw_downgrade(&ip->i_contents);
   1767 	}
   1768 #endif
   1769 
   1770 	/*
   1771 	 * We remove PROT_WRITE in cases when the file has UDF holes
   1772 	 * because we don't  want to call bmap_read() to check each
   1773 	 * page if it is backed with a disk block.
   1774 	 */
   1775 	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
   1776 		*protp &= ~PROT_WRITE;
   1777 	}
   1778 
   1779 	error = 0;
   1780 
   1781 	/*
   1782 	 * The loop looks up pages in the range <off, off + len).
   1783 	 * For each page, we first check if we should initiate an asynchronous
   1784 	 * read ahead before we call page_lookup (we may sleep in page_lookup
   1785 	 * for a previously initiated disk read).
   1786 	 */
   1787 	eoff = (uoff + len);
   1788 	for (pgoff = uoff, pgaddr = addr, pl = plarr;
   1789 	    pgoff < eoff; /* empty */) {
   1790 		page_t	*pp;
   1791 		u_offset_t	nextrio;
   1792 		se_t	se;
   1793 
   1794 		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
   1795 
   1796 		/*
   1797 		 * Handle async getpage (faultahead)
   1798 		 */
   1799 		if (plarr == NULL) {
   1800 			ip->i_nextrio = pgoff;
   1801 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
   1802 			pgoff += pgsize;
   1803 			pgaddr += pgsize;
   1804 			continue;
   1805 		}
   1806 
   1807 		/*
   1808 		 * Check if we should initiate read ahead of next cluster.
   1809 		 * We call page_exists only when we need to confirm that
   1810 		 * we have the current page before we initiate the read ahead.
   1811 		 */
   1812 		nextrio = ip->i_nextrio;
   1813 		if (seqmode &&
   1814 		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
   1815 		    nextrio < ip->i_size && page_exists(vp, pgoff))
   1816 			ud_getpage_ra(vp, pgoff, seg, pgaddr);
   1817 
   1818 		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
   1819 
   1820 			/*
   1821 			 * We found the page in the page cache.
   1822 			 */
   1823 			*pl++ = pp;
   1824 			pgoff += pgsize;
   1825 			pgaddr += pgsize;
   1826 			len -= pgsize;
   1827 			plsz -= pgsize;
   1828 		} else  {
   1829 
   1830 			/*
   1831 			 * We have to create the page, or read it from disk.
   1832 			 */
   1833 			if (error = ud_getpage_miss(vp, pgoff, len,
   1834 			    seg, pgaddr, pl, plsz, rw, seqmode)) {
   1835 				goto error_out;
   1836 			}
   1837 
   1838 			while (*pl != NULL) {
   1839 				pl++;
   1840 				pgoff += pgsize;
   1841 				pgaddr += pgsize;
   1842 				len -= pgsize;
   1843 				plsz -= pgsize;
   1844 			}
   1845 		}
   1846 	}
   1847 
   1848 	/*
   1849 	 * Return pages up to plsz if they are in the page cache.
   1850 	 * We cannot return pages if there is a chance that they are
   1851 	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
   1852 	 */
   1853 	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
   1854 
   1855 		ASSERT((protp == NULL) ||
   1856 		    !(has_holes && (*protp & PROT_WRITE)));
   1857 
   1858 		eoff = pgoff + plsz;
   1859 		while (pgoff < eoff) {
   1860 			page_t		*pp;
   1861 
   1862 			if ((pp = page_lookup_nowait(vp, pgoff,
   1863 			    SE_SHARED)) == NULL)
   1864 				break;
   1865 
   1866 			*pl++ = pp;
   1867 			pgoff += pgsize;
   1868 			plsz -= pgsize;
   1869 		}
   1870 	}
   1871 
   1872 	if (plarr)
   1873 		*pl = NULL;			/* Terminate page list */
   1874 	ip->i_nextr = pgoff;
   1875 
   1876 error_out:
   1877 	if (error && plarr) {
   1878 		/*
   1879 		 * Release any pages we have locked.
   1880 		 */
   1881 		while (pl > &plarr[0])
   1882 			page_unlock(*--pl);
   1883 
   1884 		plarr[0] = NULL;
   1885 	}
   1886 
   1887 update_inode:
   1888 #ifdef	__lock_lint
   1889 	rw_exit(&ip->i_contents);
   1890 #else
   1891 	if (dolock) {
   1892 		rw_exit(&ip->i_contents);
   1893 	}
   1894 #endif
   1895 
   1896 	/*
   1897 	 * If the inode is not already marked for IACC (in rwip() for read)
   1898 	 * and the inode is not marked for no access time update (in rwip()
   1899 	 * for write) then update the inode access time and mod time now.
   1900 	 */
   1901 	mutex_enter(&ip->i_tlock);
   1902 	if ((ip->i_flag & (IACC | INOACC)) == 0) {
   1903 		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
   1904 			ip->i_flag |= IACC;
   1905 		}
   1906 		if (rw == S_WRITE) {
   1907 			ip->i_flag |= IUPD;
   1908 		}
   1909 		ITIMES_NOLOCK(ip);
   1910 	}
   1911 	mutex_exit(&ip->i_tlock);
   1912 
   1913 	return (error);
   1914 }
   1915 
   1916 int32_t ud_delay = 1;
   1917 
   1918 /* ARGSUSED */
   1919 static int32_t
   1920 udf_putpage(
   1921 	struct vnode *vp,
   1922 	offset_t off,
   1923 	size_t len,
   1924 	int32_t flags,
   1925 	struct cred *cr,
   1926 	caller_context_t *ct)
   1927 {
   1928 	struct ud_inode *ip;
   1929 	int32_t error = 0;
   1930 
   1931 	ud_printf("udf_putpage\n");
   1932 
   1933 	ip = VTOI(vp);
   1934 #ifdef	__lock_lint
   1935 	rw_enter(&ip->i_contents, RW_WRITER);
   1936 #endif
   1937 
   1938 	if (vp->v_count == 0) {
   1939 		cmn_err(CE_WARN, "ud_putpage : bad v_count");
   1940 		error = EINVAL;
   1941 		goto out;
   1942 	}
   1943 
   1944 	if (vp->v_flag & VNOMAP) {
   1945 		error = ENOSYS;
   1946 		goto out;
   1947 	}
   1948 
   1949 	if (flags & B_ASYNC) {
   1950 		if (ud_delay && len &&
   1951 		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
   1952 			mutex_enter(&ip->i_tlock);
   1953 
   1954 			/*
   1955 			 * If nobody stalled, start a new cluster.
   1956 			 */
   1957 			if (ip->i_delaylen == 0) {
   1958 				ip->i_delayoff = off;
   1959 				ip->i_delaylen = len;
   1960 				mutex_exit(&ip->i_tlock);
   1961 				goto out;
   1962 			}
   1963 
   1964 			/*
   1965 			 * If we have a full cluster or they are not contig,
   1966 			 * then push last cluster and start over.
   1967 			 */
   1968 			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
   1969 			    ip->i_delayoff + ip->i_delaylen != off) {
   1970 				u_offset_t doff;
   1971 				size_t dlen;
   1972 
   1973 				doff = ip->i_delayoff;
   1974 				dlen = ip->i_delaylen;
   1975 				ip->i_delayoff = off;
   1976 				ip->i_delaylen = len;
   1977 				mutex_exit(&ip->i_tlock);
   1978 				error = ud_putpages(vp, doff, dlen, flags, cr);
   1979 				/* LMXXX - flags are new val, not old */
   1980 				goto out;
   1981 			}
   1982 
   1983 			/*
   1984 			 * There is something there, it's not full, and
   1985 			 * it is contig.
   1986 			 */
   1987 			ip->i_delaylen += len;
   1988 			mutex_exit(&ip->i_tlock);
   1989 			goto out;
   1990 		}
   1991 
   1992 		/*
   1993 		 * Must have weird flags or we are not clustering.
   1994 		 */
   1995 	}
   1996 
   1997 	error = ud_putpages(vp, off, len, flags, cr);
   1998 
   1999 out:
   2000 #ifdef	__lock_lint
   2001 	rw_exit(&ip->i_contents);
   2002 #endif
   2003 	return (error);
   2004 }
   2005 
   2006 /* ARGSUSED */
   2007 static int32_t
   2008 udf_map(
   2009 	struct vnode *vp,
   2010 	offset_t off,
   2011 	struct as *as,
   2012 	caddr_t *addrp,
   2013 	size_t len,
   2014 	uint8_t prot,
   2015 	uint8_t maxprot,
   2016 	uint32_t flags,
   2017 	struct cred *cr,
   2018 	caller_context_t *ct)
   2019 {
   2020 	struct segvn_crargs vn_a;
   2021 	int32_t error = 0;
   2022 
   2023 	ud_printf("udf_map\n");
   2024 
   2025 	if (vp->v_flag & VNOMAP) {
   2026 		error = ENOSYS;
   2027 		goto end;
   2028 	}
   2029 
   2030 	if ((off < (offset_t)0) ||
   2031 	    ((off + len) < (offset_t)0)) {
   2032 		error = EINVAL;
   2033 		goto end;
   2034 	}
   2035 
   2036 	if (vp->v_type != VREG) {
   2037 		error = ENODEV;
   2038 		goto end;
   2039 	}
   2040 
   2041 	/*
   2042 	 * If file is being locked, disallow mapping.
   2043 	 */
   2044 	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
   2045 		error = EAGAIN;
   2046 		goto end;
   2047 	}
   2048 
   2049 	as_rangelock(as);
   2050 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
   2051 	if (error != 0) {
   2052 		as_rangeunlock(as);
   2053 		goto end;
   2054 	}
   2055 
   2056 	vn_a.vp = vp;
   2057 	vn_a.offset = off;
   2058 	vn_a.type = flags & MAP_TYPE;
   2059 	vn_a.prot = prot;
   2060 	vn_a.maxprot = maxprot;
   2061 	vn_a.cred = cr;
   2062 	vn_a.amp = NULL;
   2063 	vn_a.flags = flags & ~MAP_TYPE;
   2064 	vn_a.szc = 0;
   2065 	vn_a.lgrp_mem_policy_flags = 0;
   2066 
   2067 	error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
   2068 	as_rangeunlock(as);
   2069 
   2070 end:
   2071 	return (error);
   2072 }
   2073 
   2074 /* ARGSUSED */
   2075 static int32_t
   2076 udf_addmap(struct vnode *vp,
   2077 	offset_t off,
   2078 	struct as *as,
   2079 	caddr_t addr,
   2080 	size_t len,
   2081 	uint8_t prot,
   2082 	uint8_t maxprot,
   2083 	uint32_t flags,
   2084 	struct cred *cr,
   2085 	caller_context_t *ct)
   2086 {
   2087 	struct ud_inode *ip = VTOI(vp);
   2088 
   2089 	ud_printf("udf_addmap\n");
   2090 
   2091 	if (vp->v_flag & VNOMAP) {
   2092 		return (ENOSYS);
   2093 	}
   2094 
   2095 	mutex_enter(&ip->i_tlock);
   2096 	ip->i_mapcnt += btopr(len);
   2097 	mutex_exit(&ip->i_tlock);
   2098 
   2099 	return (0);
   2100 }
   2101 
   2102 /* ARGSUSED */
   2103 static int32_t
   2104 udf_delmap(
   2105 	struct vnode *vp, offset_t off,
   2106 	struct as *as,
   2107 	caddr_t addr,
   2108 	size_t len,
   2109 	uint32_t prot,
   2110 	uint32_t maxprot,
   2111 	uint32_t flags,
   2112 	struct cred *cr,
   2113 	caller_context_t *ct)
   2114 {
   2115 	struct ud_inode *ip = VTOI(vp);
   2116 
   2117 	ud_printf("udf_delmap\n");
   2118 
   2119 	if (vp->v_flag & VNOMAP) {
   2120 		return (ENOSYS);
   2121 	}
   2122 
   2123 	mutex_enter(&ip->i_tlock);
   2124 	ip->i_mapcnt -= btopr(len); 	/* Count released mappings */
   2125 	ASSERT(ip->i_mapcnt >= 0);
   2126 	mutex_exit(&ip->i_tlock);
   2127 
   2128 	return (0);
   2129 }
   2130 
   2131 /* ARGSUSED */
   2132 static int32_t
   2133 udf_l_pathconf(
   2134 	struct vnode *vp,
   2135 	int32_t cmd,
   2136 	ulong_t *valp,
   2137 	struct cred *cr,
   2138 	caller_context_t *ct)
   2139 {
   2140 	int32_t error = 0;
   2141 
   2142 	ud_printf("udf_l_pathconf\n");
   2143 
   2144 	if (cmd == _PC_FILESIZEBITS) {
   2145 		/*
   2146 		 * udf supports 64 bits as file size
   2147 		 * but there are several other restrictions
   2148 		 * it only supports 32-bit block numbers and
   2149 		 * daddr32_t is only and int32_t so taking these
   2150 		 * into account we can stay just as where ufs is
   2151 		 */
   2152 		*valp = 41;
   2153 	} else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
   2154 		/* nanosecond timestamp resolution */
   2155 		*valp = 1L;
   2156 	} else {
   2157 		error = fs_pathconf(vp, cmd, valp, cr, ct);
   2158 	}
   2159 
   2160 	return (error);
   2161 }
   2162 
        /*
         * Counters bumped by udf_pageio(): one increment per I/O chunk it
         * issues, ud_pageio_reads when B_READ is set in the request flags and
         * ud_pageio_writes otherwise.  The increments are done without a lock
         * of their own; the _NOTE annotations below declare that as
         * "safe sharing".
         */
   2163 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
   2164 #ifndef	__lint
   2165 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
   2166 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
   2167 #endif
   2168 /*
   2169  * Assumption is that there will not be a pageio request
   2170  * to a enbedded file
   2171  */
   2172 /* ARGSUSED */
   2173 static int32_t
   2174 udf_pageio(
   2175 	struct vnode *vp,
   2176 	struct page *pp,
   2177 	u_offset_t io_off,
   2178 	size_t io_len,
   2179 	int32_t flags,
   2180 	struct cred *cr,
   2181 	caller_context_t *ct)
   2182 {
   2183 	daddr_t bn;
   2184 	struct buf *bp;
   2185 	struct ud_inode *ip = VTOI(vp);
   2186 	int32_t dolock, error = 0, contig, multi_io;
   2187 	size_t done_len = 0, cur_len = 0;
   2188 	page_t *npp = NULL, *opp = NULL, *cpp = pp;
   2189 
   2190 	if (pp == NULL) {
   2191 		return (EINVAL);
   2192 	}
   2193 
   2194 	dolock = (rw_owner(&ip->i_contents) != curthread);
   2195 
   2196 	/*
   2197 	 * We need a better check.  Ideally, we would use another
   2198 	 * vnodeops so that hlocked and forcibly unmounted file
   2199 	 * systems would return EIO where appropriate and w/o the
   2200 	 * need for these checks.
   2201 	 */
   2202 	if (ip->i_udf == NULL) {
   2203 		return (EIO);
   2204 	}
   2205 
   2206 #ifdef	__lock_lint
   2207 	rw_enter(&ip->i_contents, RW_READER);
   2208 #else
   2209 	if (dolock) {
   2210 		rw_enter(&ip->i_contents, RW_READER);
   2211 	}
   2212 #endif
   2213 
   2214 	/*
   2215 	 * Break the io request into chunks, one for each contiguous
   2216 	 * stretch of disk blocks in the target file.
   2217 	 */
   2218 	while (done_len < io_len) {
   2219 		ASSERT(cpp);
   2220 		bp = NULL;
   2221 		contig = 0;
   2222 		if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
   2223 		    &bn, &contig)) {
   2224 			break;
   2225 		}
   2226 
   2227 		if (bn == UDF_HOLE) {   /* No holey swapfiles */
   2228 			cmn_err(CE_WARN, "SWAP file has HOLES");
   2229 			error = EINVAL;
   2230 			break;
   2231 		}
   2232 
   2233 		cur_len = MIN(io_len - done_len, contig);
   2234 
   2235 		/*
   2236 		 * Check if more than one I/O is
   2237 		 * required to complete the given
   2238 		 * I/O operation
   2239 		 */
   2240 		if (ip->i_udf->udf_lbsize < PAGESIZE) {
   2241 			if (cur_len >= PAGESIZE) {
   2242 				multi_io = 0;
   2243 				cur_len &= PAGEMASK;
   2244 			} else {
   2245 				multi_io = 1;
   2246 				cur_len = MIN(io_len - done_len, PAGESIZE);
   2247 			}
   2248 		}
   2249 		page_list_break(&cpp, &npp, btop(cur_len));
   2250 
   2251 		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
   2252 		ASSERT(bp != NULL);
   2253 
   2254 		bp->b_edev = ip->i_dev;
   2255 		bp->b_dev = cmpdev(ip->i_dev);
   2256 		bp->b_blkno = bn;
   2257 		bp->b_un.b_addr = (caddr_t)0;
   2258 		bp->b_file = vp;
   2259 		bp->b_offset = (offset_t)(io_off + done_len);
   2260 
   2261 /*
   2262  *		ub.ub_pageios.value.ul++;
   2263  */
   2264 		if (multi_io == 0) {
   2265 			(void) bdev_strategy(bp);
   2266 		} else {
   2267 			error = ud_multi_strat(ip, cpp, bp,
   2268 			    (u_offset_t)(io_off + done_len));
   2269 			if (error != 0) {
   2270 				pageio_done(bp);
   2271 				break;
   2272 			}
   2273 		}
   2274 		if (flags & B_READ) {
   2275 			ud_pageio_reads++;
   2276 		} else {
   2277 			ud_pageio_writes++;
   2278 		}
   2279 
   2280 		/*
   2281 		 * If the request is not B_ASYNC, wait for i/o to complete
   2282 		 * and re-assemble the page list to return to the caller.
   2283 		 * If it is B_ASYNC we leave the page list in pieces and
   2284 		 * cleanup() will dispose of them.
   2285 		 */
   2286 		if ((flags & B_ASYNC) == 0) {
   2287 			error = biowait(bp);
   2288 			pageio_done(bp);
   2289 			if (error) {
   2290 				break;
   2291 			}
   2292 			page_list_concat(&opp, &cpp);
   2293 		}
   2294 		cpp = npp;
   2295 		npp = NULL;
   2296 		done_len += cur_len;
   2297 	}
   2298 
   2299 	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
   2300 	if (error) {
   2301 		if (flags & B_ASYNC) {
   2302 			/* Cleanup unprocessed parts of list */
   2303 			page_list_concat(&cpp, &npp);
   2304 			if (flags & B_READ) {
   2305 				pvn_read_done(cpp, B_ERROR);
   2306 			} else {
   2307 				pvn_write_done(cpp, B_ERROR);
   2308 			}
   2309 		} else {
   2310 			/* Re-assemble list and let caller clean up */
   2311 			page_list_concat(&opp, &cpp);
   2312 			page_list_concat(&opp, &npp);
   2313 		}
   2314 	}
   2315 
   2316 #ifdef	__lock_lint
   2317 	rw_exit(&ip->i_contents);
   2318 #else
   2319 	if (dolock) {
   2320 		rw_exit(&ip->i_contents);
   2321 	}
   2322 #endif
   2323 	return (error);
   2324 }
   2325 
   2326 
   2327 
   2328 
   2329 /* -------------------- local functions --------------------------- */
   2330 
   2331 
   2332 
   2333 int32_t
   2334 ud_rdwri(enum uio_rw rw, int32_t ioflag,
   2335 	struct ud_inode *ip, caddr_t base, int32_t len,
   2336 	offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
   2337 {
   2338 	int32_t error;
   2339 	struct uio auio;
   2340 	struct iovec aiov;
   2341 
   2342 	ud_printf("ud_rdwri\n");
   2343 
   2344 	bzero((caddr_t)&auio, sizeof (uio_t));
   2345 	bzero((caddr_t)&aiov, sizeof (iovec_t));
   2346 
   2347 	aiov.iov_base = base;
   2348 	aiov.iov_len = len;
   2349 	auio.uio_iov = &aiov;
   2350 	auio.uio_iovcnt = 1;
   2351 	auio.uio_loffset = offset;
   2352 	auio.uio_segflg = (int16_t)seg;
   2353 	auio.uio_resid = len;
   2354 
   2355 	if (rw == UIO_WRITE) {
   2356 		auio.uio_fmode = FWRITE;
   2357 		auio.uio_extflg = UIO_COPY_DEFAULT;
   2358 		auio.uio_llimit = curproc->p_fsz_ctl;
   2359 		error = ud_wrip(ip, &auio, ioflag, cr);
   2360 	} else {
   2361 		auio.uio_fmode = FREAD;
   2362 		auio.uio_extflg = UIO_COPY_CACHED;
   2363 		auio.uio_llimit = MAXOFFSET_T;
   2364 		error = ud_rdip(ip, &auio, ioflag, cr);
   2365 	}
   2366 
   2367 	if (aresid) {
   2368 		*aresid = auio.uio_resid;
   2369 	} else if (auio.uio_resid) {
   2370 		error = EIO;
   2371 	}
   2372 	return (error);
   2373 }
   2374 
   2375 /*
   2376  * Free behind hacks.  The pager is busted.
   2377  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
   2378  * or B_FREE_IF_TIGHT_ON_MEMORY.
         *
         * ud_freebehind switches the free-behind decision in ud_rdip() on (non-zero)
         * or off (zero).  ud_smallfile is the byte offset a read has to be beyond
         * before ud_rdip() considers freeing the mapping behind it.
   2379  */
   2380 int32_t ud_freebehind = 1;
   2381 int32_t ud_smallfile = 32 * 1024;
   2382 
   2383 /* ARGSUSED */
   2384 int32_t
   2385 ud_getpage_miss(struct vnode *vp, u_offset_t off,
   2386 	size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
   2387 	size_t plsz, enum seg_rw rw, int32_t seq)
   2388 {
   2389 	struct ud_inode *ip = VTOI(vp);
   2390 	int32_t err = 0;
   2391 	size_t io_len;
   2392 	u_offset_t io_off;
   2393 	u_offset_t pgoff;
   2394 	page_t *pp;
   2395 
   2396 	pl[0] = NULL;
   2397 
   2398 	/*
   2399 	 * Figure out whether the page can be created, or must be
   2400 	 * read from the disk
   2401 	 */
   2402 	if (rw == S_CREATE) {
   2403 		if ((pp = page_create_va(vp, off,
   2404 		    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
   2405 			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
   2406 			return (EINVAL);
   2407 		}
   2408 		io_len = PAGESIZE;
   2409 	} else {
   2410 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
   2411 		    &io_len, off, PAGESIZE, 0);
   2412 
   2413 		/*
   2414 		 * Some other thread has entered the page.
   2415 		 * ud_getpage will retry page_lookup.
   2416 		 */
   2417 		if (pp == NULL) {
   2418 			return (0);
   2419 		}
   2420 
   2421 		/*
   2422 		 * Fill the page with as much data as we can from the file.
   2423 		 */
   2424 		err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
   2425 		if (err) {
   2426 			pvn_read_done(pp, B_ERROR);
   2427 			return (err);
   2428 		}
   2429 
   2430 		/*
   2431 		 * XXX ??? ufs has io_len instead of pgoff below
   2432 		 */
   2433 		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
   2434 
   2435 		/*
   2436 		 * If the file access is sequential, initiate read ahead
   2437 		 * of the next cluster.
   2438 		 */
   2439 		if (seq && ip->i_nextrio < ip->i_size) {
   2440 			ud_getpage_ra(vp, off, seg, addr);
   2441 		}
   2442 	}
   2443 
   2444 outmiss:
   2445 	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
   2446 	return (err);
   2447 }
   2448 
   2449 /* ARGSUSED */
   2450 void
   2451 ud_getpage_ra(struct vnode *vp,
   2452 	u_offset_t off, struct seg *seg, caddr_t addr)
   2453 {
   2454 	page_t *pp;
   2455 	size_t io_len;
   2456 	struct ud_inode *ip = VTOI(vp);
   2457 	u_offset_t io_off = ip->i_nextrio, pgoff;
   2458 	caddr_t addr2 = addr + (io_off - off);
   2459 	daddr_t bn;
   2460 	int32_t contig = 0;
   2461 
   2462 	/*
   2463 	 * Is this test needed?
   2464 	 */
   2465 
   2466 	if (addr2 >= seg->s_base + seg->s_size) {
   2467 		return;
   2468 	}
   2469 
   2470 	contig = 0;
   2471 	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
   2472 		return;
   2473 	}
   2474 
   2475 	pp = pvn_read_kluster(vp, io_off, seg, addr2,
   2476 	    &io_off, &io_len, io_off, PAGESIZE, 1);
   2477 
   2478 	/*
   2479 	 * Some other thread has entered the page.
   2480 	 * So no read head done here (ie we will have to and wait
   2481 	 * for the read when needed).
   2482 	 */
   2483 
   2484 	if (pp == NULL) {
   2485 		return;
   2486 	}
   2487 
   2488 	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
   2489 	ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
   2490 }
   2491 
        /*
         * ud_page_fill - bring the file data for one page into pp.
         *
         *	ip	inode of the file
         *	pp	page to fill
         *	off	file offset handed to ud_bmap_read() for the lookup
         *	bflgs	flags handed to pageio_setup(); when B_ASYNC is clear
         *		the routine waits for the I/O with biowait()
         *	pg_off	out: receives the final value of "contig"
         *
         * Embedded files (ICB_FLAG_ONE_AD) are copied out of the file entry
         * block obtained with ud_bread().  All other files are read from the
         * block reported by ud_bmap_read(), with a single bdev_strategy() call
         * or, when multi_io gets set, through ud_multi_strat().
         *
         * Returns 0, or the error from ud_bmap_read(), ud_multi_strat() or
         * biowait().
         *
         * NOTE(review): a failed ud_bread() in the embedded path only skips the
         * copy; error stays 0 -- confirm that this is intended.
         * NOTE(review): when ud_multi_strat() fails the buf from pageio_setup()
         * is not handed to pageio_done() here, unlike in udf_pageio() --
         * confirm who releases it.
         */
   2492 int
   2493 ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
   2494 	uint32_t bflgs, u_offset_t *pg_off)
   2495 {
   2496 	daddr_t bn;
   2497 	struct buf *bp;
   2498 	caddr_t kaddr, caddr;
   2499 	int32_t error = 0, contig = 0, multi_io = 0;
   2500 	int32_t lbsize = ip->i_udf->udf_lbsize;
   2501 	int32_t lbmask = ip->i_udf->udf_lbmask;
   2502 	uint64_t isize;
   2503 
        	/* file size rounded up to a whole logical block */
   2504 	isize = (ip->i_size + lbmask) & (~lbmask);
   2505 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
   2506 
   2507 		/*
   2508 		 * Embedded file read file_entry
   2509 		 * from buffer cache and copy the required
   2510 		 * portions
   2511 		 */
   2512 		bp = ud_bread(ip->i_dev,
   2513 		    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
   2514 		if ((bp->b_error == 0) &&
   2515 		    (bp->b_resid == 0)) {
   2516 
        			/* file data starts i_data_off bytes into the block */
   2517 			caddr = bp->b_un.b_addr + ip->i_data_off;
   2518 
   2519 			/*
   2520 			 * mapin to kvm
   2521 			 */
   2522 			kaddr = (caddr_t)ppmapin(pp,
   2523 			    PROT_READ | PROT_WRITE, (caddr_t)-1);
        			/* the result of kcopy() is deliberately ignored */
   2524 			(void) kcopy(caddr, kaddr, ip->i_size);
   2525 
   2526 			/*
   2527 			 * mapout of kvm
   2528 			 */
   2529 			ppmapout(kaddr);
   2530 		}
   2531 		brelse(bp);
   2532 		contig = ip->i_size;
   2533 	} else {
   2534 
   2535 		/*
   2536 		 * Get the continuous size and block number
   2537 		 * at offset "off"
   2538 		 */
   2539 		if (error = ud_bmap_read(ip, off, &bn, &contig))
   2540 			goto out;
        		/* at most one page, rounded up to whole logical blocks */
   2541 		contig = MIN(contig, PAGESIZE);
   2542 		contig = (contig + lbmask) & (~lbmask);
   2543 
   2544 		/*
   2545 		 * Zero part of the page which we are not
   2546 		 * going to read from the disk.
   2547 		 */
   2548 
   2549 		if (bn == UDF_HOLE) {
   2550 
   2551 			/*
   2552 			 * This is a HOLE. Just zero out
   2553 			 * the page
        			 *
        			 * NOTE(review): pagezero() is given pp->p_prev
        			 * rather than pp; presumably the two are the same
        			 * page on a one-page list -- confirm.  A hole that
        			 * matches neither test below falls through to the
        			 * I/O path with bn == UDF_HOLE.
   2554 			 */
   2555 			if (((off + contig) == isize) ||
   2556 			    (contig == PAGESIZE)) {
   2557 				pagezero(pp->p_prev, 0, PAGESIZE);
   2558 				goto out;
   2559 			}
   2560 		}
   2561 
   2562 		if (contig < PAGESIZE) {
   2563 			uint64_t count;
   2564 
        			/* bytes from off to the block-rounded end of file */
   2565 			count = isize - off;
   2566 			if (contig != count) {
        				/* more file data follows in another extent */
   2567 				multi_io = 1;
   2568 				contig = (int32_t)(MIN(count, PAGESIZE));
   2569 			} else {
        				/* the file ends here: clear the rest of the page */
   2570 				pagezero(pp->p_prev, contig, PAGESIZE - contig);
   2571 			}
   2572 		}
   2573 
   2574 		/*
   2575 		 * Get a bp and initialize it
   2576 		 */
   2577 		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
   2578 		ASSERT(bp != NULL);
   2579 
   2580 		bp->b_edev = ip->i_dev;
   2581 		bp->b_dev = cmpdev(ip->i_dev);
   2582 		bp->b_blkno = bn;
   2583 		bp->b_un.b_addr = 0;
   2584 		bp->b_file = ip->i_vnode;
   2585 
   2586 		/*
   2587 		 * Start I/O
   2588 		 */
   2589 		if (multi_io == 0) {
   2590 
   2591 			/*
   2592 			 * Single I/O is sufficient for this page
   2593 			 */
   2594 			(void) bdev_strategy(bp);
   2595 		} else {
   2596 
   2597 			/*
   2598 			 * We need to do the I/O in
   2599 			 * piece's
   2600 			 */
   2601 			error = ud_multi_strat(ip, pp, bp, off);
   2602 			if (error != 0) {
   2603 				goto out;
   2604 			}
   2605 		}
   2606 		if ((bflgs & B_ASYNC) == 0) {
   2607 
   2608 			/*
   2609 			 * Wait for i/o to complete.
   2610 			 */
   2611 
   2612 			error = biowait(bp);
   2613 			pageio_done(bp);
   2614 			if (error) {
   2615 				goto out;
   2616 			}
   2617 		}
   2618 	}
        	/* never report more than what lies between off and i_size */
   2619 	if ((off + contig) >= ip->i_size) {
   2620 		contig = ip->i_size - off;
   2621 	}
   2622 
   2623 out:
   2624 	*pg_off = contig;
   2625 	return (error);
   2626 }
   2627 
        /*
         * ud_putpages - process the cached pages of vp from offset "off" onwards.
         *
         *	len	0 means every page at or beyond off (pvn_vplist_dirty());
         *		otherwise the range [off, off + len), capped at the
         *		block-rounded file size when that is non-zero
         *	flags	B_* flags; B_INVAL, B_FREE and B_ASYNC are examined here
         *		and the whole set is passed on to the pvn routines and to
         *		ud_putapage()
         *
         * i_contents is taken as reader for the duration unless the calling
         * thread already owns it.
         *
         * Returns 0, EINVAL when the vnode has no holds (v_count == 0), or the
         * error from pvn_vplist_dirty() / ud_putapage().
         */
   2628 int32_t
   2629 ud_putpages(struct vnode *vp, offset_t off,
   2630 	size_t len, int32_t flags, struct cred *cr)
   2631 {
   2632 	struct ud_inode *ip;
   2633 	page_t *pp;
   2634 	u_offset_t io_off;
   2635 	size_t io_len;
   2636 	u_offset_t eoff;
   2637 	int32_t err = 0;
   2638 	int32_t dolock;
   2639 
   2640 	ud_printf("ud_putpages\n");
   2641 
   2642 	if (vp->v_count == 0) {
   2643 		cmn_err(CE_WARN, "ud_putpages: bad v_count");
   2644 		return (EINVAL);
   2645 	}
   2646 
   2647 	ip = VTOI(vp);
   2648 
   2649 	/*
   2650 	 * Acquire the readers/write inode lock before locking
   2651 	 * any pages in this inode.
   2652 	 * The inode lock is held during i/o.
   2653 	 */
   2654 	if (len == 0) {
        		/* a whole-file request resets the delayed-write window */
   2655 		mutex_enter(&ip->i_tlock);
   2656 		ip->i_delayoff = ip->i_delaylen = 0;
   2657 		mutex_exit(&ip->i_tlock);
   2658 	}
   2659 #ifdef	__lock_lint
   2660 	rw_enter(&ip->i_contents, RW_READER);
   2661 #else
        	/* do not re-enter the lock if this thread already owns it */
   2662 	dolock = (rw_owner(&ip->i_contents) != curthread);
   2663 	if (dolock) {
   2664 		rw_enter(&ip->i_contents, RW_READER);
   2665 	}
   2666 #endif
   2667 
        	/* nothing cached for this vnode: nothing to do */
   2668 	if (!vn_has_cached_data(vp)) {
   2669 #ifdef	__lock_lint
   2670 		rw_exit(&ip->i_contents);
   2671 #else
   2672 		if (dolock) {
   2673 			rw_exit(&ip->i_contents);
   2674 		}
   2675 #endif
   2676 		return (0);
   2677 	}
   2678 
   2679 	if (len == 0) {
   2680 		/*
   2681 		 * Search the entire vp list for pages >= off.
   2682 		 */
   2683 		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
   2684 		    flags, cr);
   2685 	} else {
   2686 		/*
   2687 		 * Loop over all offsets in the range looking for
   2688 		 * pages to deal with.
   2689 		 */
   2690 		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
   2691 			eoff = MIN(off + len, eoff);
   2692 		} else {
   2693 			eoff = off + len;
   2694 		}
   2695 
   2696 		for (io_off = off; io_off < eoff; io_off += io_len) {
   2697 			/*
   2698 			 * If we are not invalidating, synchronously
   2699 			 * freeing or writing pages, use the routine
   2700 			 * page_lookup_nowait() to prevent reclaiming
   2701 			 * them from the free list.
   2702 			 */
   2703 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
   2704 				pp = page_lookup(vp, io_off,
   2705 				    (flags & (B_INVAL | B_FREE)) ?
   2706 				    SE_EXCL : SE_SHARED);
   2707 			} else {
   2708 				pp = page_lookup_nowait(vp, io_off,
   2709 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
   2710 			}
   2711 
        			/* no page, or nothing to write: step one page on */
   2712 			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
   2713 				io_len = PAGESIZE;
   2714 			} else {
   2715 
   2716 				err = ud_putapage(vp, pp,
   2717 				    &io_off, &io_len, flags, cr);
   2718 				if (err != 0) {
   2719 					break;
   2720 				}
   2721 				/*
   2722 				 * "io_off" and "io_len" are returned as
   2723 				 * the range of pages we actually wrote.
   2724 				 * This allows us to skip ahead more quickly
   2725 				 * since several pages may've been dealt
   2726 				 * with by this iteration of the loop.
   2727 				 */
   2728 			}
   2729 		}
   2730 	}
   2731 	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
   2732 		/*
   2733 		 * We have just sync'ed back all the pages on
   2734 		 * the inode, turn off the IMODTIME flag.
   2735 		 */
   2736 		mutex_enter(&ip->i_tlock);
   2737 		ip->i_flag &= ~IMODTIME;
   2738 		mutex_exit(&ip->i_tlock);
   2739 	}
   2740 #ifdef	__lock_lint
   2741 	rw_exit(&ip->i_contents);
   2742 #else
   2743 	if (dolock) {
   2744 		rw_exit(&ip->i_contents);
   2745 	}
   2746 #endif
   2747 	return (err);
   2748 }
   2749 
        /*
         * ud_putapage - write the page pp, together with whatever
         * pvn_write_kluster() gathers around it, back to the file.
         *
         *	offp	if non-NULL, receives io_off
         *	lenp	if non-NULL, receives io_len
         *	flags	B_* flags; B_ASYNC and B_FORCE are examined here
         *
         * The caller must hold i_contents (asserted) and the file system must
         * be mounted read/write (asserted).
         *
         * Embedded files (ICB_FLAG_ONE_AD) are written by copying the page
         * into the file entry block and writing that block with bwrite().
         * Other files go through pageio_setup() and bdev_strategy() or
         * ud_multi_strat(), with ud_iodone() installed as b_iodone and the
         * byte count charged to i_writes.
         *
         * Returns 0 or an errno value.
         *
         * NOTE(review): when ud_bmap_read() fails, control jumps to out: before
         * io_off and io_len have been assigned, and both are then stored
         * through offp/lenp -- confirm no caller relies on them after an error.
         * NOTE(review): when ud_multi_strat() fails, the buf is not released
         * here and the i_writes charge is not undone in this function --
         * confirm where that happens.
         */
   2750 /* ARGSUSED */
   2751 int32_t
   2752 ud_putapage(struct vnode *vp,
   2753 	page_t *pp, u_offset_t *offp,
   2754 	size_t *lenp, int32_t flags, struct cred *cr)
   2755 {
   2756 	daddr_t bn;
   2757 	size_t io_len;
   2758 	struct ud_inode *ip;
   2759 	int32_t error = 0, contig, multi_io = 0;
   2760 	struct udf_vfs *udf_vfsp;
   2761 	u_offset_t off, io_off;
   2762 	caddr_t kaddr, caddr;
   2763 	struct buf *bp = NULL;
   2764 	int32_t lbmask;
   2765 	uint64_t isize;
   2766 	int32_t crc_len;
   2767 	struct file_entry *fe;
   2768 
   2769 	ud_printf("ud_putapage\n");
   2770 
   2771 	ip = VTOI(vp);
   2772 	ASSERT(ip);
   2773 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   2774 	lbmask = ip->i_udf->udf_lbmask;
        	/* file size rounded up to a whole logical block */
   2775 	isize = (ip->i_size + lbmask) & (~lbmask);
   2776 
   2777 	udf_vfsp = ip->i_udf;
   2778 	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
   2779 
   2780 	/*
   2781 	 * If the modified time on the inode has not already been
   2782 	 * set elsewhere (e.g. for write/setattr) we set the time now.
   2783 	 * This gives us approximate modified times for mmap'ed files
   2784 	 * which are modified via stores in the user address space.
   2785 	 */
   2786 	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
   2787 		mutex_enter(&ip->i_tlock);
   2788 		ip->i_flag |= IUPD;
   2789 		ITIMES_NOLOCK(ip);
   2790 		mutex_exit(&ip->i_tlock);
   2791 	}
   2792 
   2793 
   2794 	/*
   2795 	 * Align the request to a block boundry (for old file systems),
   2796 	 * and go ask bmap() how contiguous things are for this file.
   2797 	 */
   2798 	off = pp->p_offset & ~(offset_t)lbmask;
   2799 				/* block align it */
   2800 
   2801 
   2802 	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
   2803 		ASSERT(ip->i_size <= ip->i_max_emb);
   2804 
   2805 		pp = pvn_write_kluster(vp, pp, &io_off,
   2806 		    &io_len, off, PAGESIZE, flags);
   2807 		if (io_len == 0) {
   2808 			io_len = PAGESIZE;
   2809 		}
   2810 
        		/* read the file entry block that holds the embedded data */
   2811 		bp = ud_bread(ip->i_dev,
   2812 		    ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
   2813 		    udf_vfsp->udf_lbsize);
   2814 		fe = (struct file_entry *)bp->b_un.b_addr;
   2815 		if ((bp->b_flags & B_ERROR) ||
   2816 		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
   2817 		    ip->i_icb_block,
   2818 		    1, udf_vfsp->udf_lbsize) != 0)) {
        			/* EIO for a read error, EINVAL for a bad descriptor */
   2819 			if (pp != NULL)
   2820 				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
   2821 			if (bp->b_flags & B_ERROR) {
   2822 				error = EIO;
   2823 			} else {
   2824 				error = EINVAL;
   2825 			}
   2826 			brelse(bp);
   2827 			return (error);
   2828 		}
   2829 		if ((bp->b_error == 0) &&
   2830 		    (bp->b_resid == 0)) {
   2831 
        			/* copy i_size bytes from the page into the block */
   2832 			caddr = bp->b_un.b_addr + ip->i_data_off;
   2833 			kaddr = (caddr_t)ppmapin(pp,
   2834 			    PROT_READ | PROT_WRITE, (caddr_t)-1);
   2835 			(void) kcopy(kaddr, caddr, ip->i_size);
   2836 			ppmapout(kaddr);
   2837 		}
        		/*
        		 * Length handed to ud_make_tag(): offset of fe_spec in
        		 * the file entry, plus fe_len_ear, plus the file data.
        		 */
   2838 		crc_len = ((uint32_t)&((struct file_entry *)0)->fe_spec) +
   2839 		    SWAP_32(fe->fe_len_ear);
   2840 		crc_len += ip->i_size;
   2841 		ud_make_tag(ip->i_udf, &fe->fe_tag,
   2842 		    UD_FILE_ENTRY, ip->i_icb_block, crc_len);
   2843 
   2844 		bwrite(bp);
   2845 
        		/* the non-B_ASYNC case is completed after the if/else */
   2846 		if (flags & B_ASYNC) {
   2847 			pvn_write_done(pp, flags);
   2848 		}
   2849 		contig = ip->i_size;
   2850 	} else {
   2851 
        		/*
        		 * NOTE(review): io_off and io_len are still unset when
        		 * this goto is taken.
        		 */
   2852 		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
   2853 			goto out;
   2854 		}
        		/* at most one page, rounded up to whole logical blocks */
   2855 		contig = MIN(contig, PAGESIZE);
   2856 		contig = (contig + lbmask) & (~lbmask);
   2857 
   2858 		if (contig < PAGESIZE) {
   2859 			uint64_t count;
   2860 
        			/* bytes from off to the block-rounded end of file */
   2861 			count = isize - off;
   2862 			if (contig != count) {
        				/* the page spans more than one extent */
   2863 				multi_io = 1;
   2864 				contig = (int32_t)(MIN(count, PAGESIZE));
   2865 			}
   2866 		}
   2867 
        		/* do not write beyond the block-rounded end of file */
   2868 		if ((off + contig) > isize) {
   2869 			contig = isize - off;
   2870 		}
   2871 
   2872 		if (contig > PAGESIZE) {
   2873 			if (contig & PAGEOFFSET) {
   2874 				contig &= PAGEMASK;
   2875 			}
   2876 		}
   2877 
   2878 		pp = pvn_write_kluster(vp, pp, &io_off,
   2879 		    &io_len, off, contig, flags);
   2880 		if (io_len == 0) {
   2881 			io_len = PAGESIZE;
   2882 		}
   2883 
   2884 		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
   2885 		ASSERT(bp != NULL);
   2886 
   2887 		bp->b_edev = ip->i_dev;
   2888 		bp->b_dev = cmpdev(ip->i_dev);
   2889 		bp->b_blkno = bn;
   2890 		bp->b_un.b_addr = 0;
   2891 		bp->b_file = vp;
   2892 		bp->b_offset = (offset_t)off;
   2893 
   2894 
   2895 		/*
   2896 		 * write throttle
        		 *
        		 * Charge the bytes of this request to i_writes;
        		 * ud_iodone() subtracts them again when the buf completes.
   2897 		 */
   2898 		ASSERT(bp->b_iodone == NULL);
   2899 		bp->b_iodone = ud_iodone;
   2900 		mutex_enter(&ip->i_tlock);
   2901 		ip->i_writes += bp->b_bcount;
   2902 		mutex_exit(&ip->i_tlock);
   2903 
   2904 		if (multi_io == 0) {
   2905 
   2906 			(void) bdev_strategy(bp);
   2907 		} else {
   2908 			error = ud_multi_strat(ip, pp, bp, off);
   2909 			if (error != 0) {
   2910 				goto out;
   2911 			}
   2912 		}
   2913 
   2914 		if ((flags & B_ASYNC) == 0) {
   2915 			/*
   2916 			 * Wait for i/o to complete.
   2917 			 */
   2918 			error = biowait(bp);
   2919 			pageio_done(bp);
   2920 		}
   2921 	}
   2922 
        	/* synchronous request: finish the pages here, flagging any error */
   2923 	if ((flags & B_ASYNC) == 0) {
   2924 		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
   2925 	}
   2926 
        	/* the pages have been dealt with; keep out: from touching them */
   2927 	pp = NULL;
   2928 
   2929 out:
        	/* only reached with pp != NULL through the error gotos above */
   2930 	if (error != 0 && pp != NULL) {
   2931 		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
   2932 	}
   2933 
   2934 	if (offp) {
   2935 		*offp = io_off;
   2936 	}
   2937 	if (lenp) {
   2938 		*lenp = io_len;
   2939 	}
   2940 
   2941 	return (error);
   2942 }
   2943 
   2944 
   2945 int32_t
   2946 ud_iodone(struct buf *bp)
   2947 {
   2948 	struct ud_inode *ip;
   2949 
   2950 	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
   2951 
   2952 	bp->b_iodone = NULL;
   2953 
   2954 	ip = VTOI(bp->b_pages->p_vnode);
   2955 
   2956 	mutex_enter(&ip->i_tlock);
   2957 	if (ip->i_writes >= ud_LW) {
   2958 		if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
   2959 			if (ud_WRITES) {
   2960 				cv_broadcast(&ip->i_wrcv); /* wake all up */
   2961 			}
   2962 		}
   2963 	} else {
   2964 		ip->i_writes -= bp->b_bcount;
   2965 	}
   2966 	mutex_exit(&ip->i_tlock);
   2967 	iodone(bp);
   2968 	return (0);
   2969 }
   2970 
        /*
         * ud_rdip - read from the inode ip into the buffers described by uio.
         *
         *	ioflag	FRSYNC, FSYNC and FDSYNC are examined here
         *	cr	unused
         *
         * The caller must hold i_contents (asserted).  When it is held as
         * reader the lock is dropped around each segmap_getmapflt()/uiomove()
         * pass and taken again afterwards; when it is held as writer it is kept.
         * Data is moved at most one logical block at a time through segkmap,
         * and the loop stops at end of file, on error, or when uio is satisfied.
         *
         * Returns EIO for inode types other than VREG, VDIR and VLNK, EINVAL
         * for a negative offset or an offset + resid that wraps negative, and 0
         * for an offset beyond MAXOFFSET_T or an empty request.  An error that
         * occurs after some data has been transferred is reported as 0.
         *
         * NOTE(review): in the FRSYNC path below i_contents is released and
         * re-acquired as RW_WRITER and stays that way on return, even if the
         * caller entered holding it as reader -- confirm callers expect this.
         */
   2971 /* ARGSUSED3 */
   2972 int32_t
   2973 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
   2974 {
   2975 	struct vnode *vp;
   2976 	struct udf_vfs *udf_vfsp;
   2977 	krw_t rwtype;
   2978 	caddr_t base;
   2979 	uint32_t flags;
   2980 	int32_t error, n, on, mapon, dofree;
   2981 	u_offset_t off;
   2982 	long oresid = uio->uio_resid;
   2983 
   2984 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
   2985 	if ((ip->i_type != VREG) &&
   2986 	    (ip->i_type != VDIR) &&
   2987 	    (ip->i_type != VLNK)) {
   2988 		return (EIO);
   2989 	}
   2990 
   2991 	if (uio->uio_loffset > MAXOFFSET_T) {
   2992 		return (0);
   2993 	}
   2994 
   2995 	if ((uio->uio_loffset < (offset_t)0) ||
   2996 	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
   2997 		return (EINVAL);
   2998 	}
   2999 	if (uio->uio_resid == 0) {
   3000 		return (0);
   3001 	}
   3002 
   3003 	vp = ITOV(ip);
   3004 	udf_vfsp = ip->i_udf;
        	/* note the access on the inode */
   3005 	mutex_enter(&ip->i_tlock);
   3006 	ip->i_flag |= IACC;
   3007 	mutex_exit(&ip->i_tlock);
   3008 
        	/* remember how the caller holds i_contents */
   3009 	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
   3010 
   3011 	do {
   3012 		offset_t diff;
   3013 		u_offset_t uoff = uio->uio_loffset;
        		/*
        		 * off	 start of the MAXBSIZE window containing uoff
        		 * mapon offset of uoff within that window
        		 * on	 offset of uoff within its logical block
        		 * n	 bytes to move in this pass
        		 */
   3014 		off = uoff & (offset_t)MAXBMASK;
   3015 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
   3016 		on = (int)blkoff(udf_vfsp, uoff);
   3017 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
   3018 
        		/* bytes left between uoff and end of file */
   3019 		diff = ip->i_size - uoff;
   3020 
   3021 		if (diff <= (offset_t)0) {
   3022 			error = 0;
   3023 			goto out;
   3024 		}
   3025 		if (diff < (offset_t)n) {
   3026 			n = (int)diff;
   3027 		}
        		/*
        		 * Free behind is considered only when it is enabled,
        		 * i_nextr matches this window and the offset is beyond
        		 * ud_smallfile.
        		 */
   3028 		dofree = ud_freebehind &&
   3029 		    ip->i_nextr == (off & PAGEMASK) &&
   3030 		    off > ud_smallfile;
   3031 
   3032 #ifndef	__lock_lint
   3033 		if (rwtype == RW_READER) {
   3034 			rw_exit(&ip->i_contents);
   3035 		}
   3036 #endif
   3037 
   3038 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
   3039 		    (uint32_t)n, 1, S_READ);
   3040 		error = uiomove(base + mapon, (long)n, UIO_READ, uio);
   3041 
   3042 		flags = 0;
   3043 		if (!error) {
   3044 			/*
   3045 			 * If read a whole block, or read to eof,
   3046 			 * won't need this buffer again soon.
   3047 			 */
   3048 			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
   3049 			    freemem < lotsfree + pages_before_pager) {
   3050 				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
   3051 			}
   3052 			/*
   3053 			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
   3054 			 * we want to make sure that the page which has
   3055 			 * been read, is written on disk if it is dirty.
   3056 			 * And corresponding indirect blocks should also
   3057 			 * be flushed out.
   3058 			 */
   3059 			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
   3060 				flags &= ~SM_ASYNC;
   3061 				flags |= SM_WRITE;
   3062 			}
   3063 			error = segmap_release(segkmap, base, flags);
   3064 		} else    {
        			/* keep the uiomove() error; release with no flags */
   3065 			(void) segmap_release(segkmap, base, flags);
   3066 		}
   3067 
   3068 #ifndef __lock_lint
   3069 		if (rwtype == RW_READER) {
   3070 			rw_enter(&ip->i_contents, rwtype);
   3071 		}
   3072 #endif
   3073 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
   3074 out:
   3075 	/*
   3076 	 * Inode is updated according to this table if FRSYNC is set.
   3077 	 *
   3078 	 *	FSYNC	FDSYNC(posix.4)
   3079 	 *	--------------------------
   3080 	 *	always	IATTCHG|IBDWRITE
   3081 	 */
   3082 	if (ioflag & FRSYNC) {
   3083 		if ((ioflag & FSYNC) ||
   3084 		    ((ioflag & FDSYNC) &&
   3085 		    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
        		/* swap to the writer lock before calling ud_iupdat() */
   3086 		rw_exit(&ip->i_contents);
   3087 		rw_enter(&ip->i_contents, RW_WRITER);
   3088 		ud_iupdat(ip, 1);
   3089 		}
   3090 	}
   3091 	/*
   3092 	 * If we've already done a partial read, terminate
   3093 	 * the read but return no error.
   3094 	 */
   3095 	if (oresid != uio->uio_resid) {
   3096 		error = 0;
   3097 	}
   3098 	ITIMES(ip);
   3099 
   3100 	return (error);
   3101 }
   3102 
   3103 int32_t
   3104 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
   3105 {
   3106 	caddr_t base;
   3107 	struct vnode *vp;
   3108 	struct udf_vfs *udf_vfsp;
   3109 	uint32_t flags;
   3110 	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
   3111 	int32_t pagecreate, newpage;
   3112 	uint64_t old_i_size;
   3113 	u_offset_t off;
   3114 	long start_resid = uio->uio_resid, premove_resid;
   3115 	rlim64_t limit = uio->uio_limit;
   3116 
   3117 
   3118 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
   3119 	if ((ip->i_type != VREG) &&
   3120 	    (ip->i_type != VDIR) &&
   3121 	    (ip->i_type != VLNK)) {
   3122 		return (EIO);
   3123 	}
   3124 
   3125 	if (uio->uio_loffset >= MAXOFFSET_T) {
   3126 		return (EFBIG);
   3127 	}
   3128 	/*
   3129 	 * see udf_l_pathconf
   3130 	 */
   3131 	if (limit > (((uint64_t)1 << 40) - 1)) {
   3132 		limit = ((uint64_t)1 << 40) - 1;
   3133 	}
   3134 	if (uio->uio_loffset >= limit) {
   3135 		proc_t *p = ttoproc(curthread);
   3136 
   3137 		mutex_enter(&p->p_lock);
   3138 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
   3139 		    p, RCA_UNSAFE_SIGINFO);
   3140 		mutex_exit(&p->p_lock);
   3141 		return (EFBIG);
   3142 	}
   3143 	if ((uio->uio_loffset < (offset_t)0) ||
   3144 	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
   3145 		return (EINVAL);
   3146 	}
   3147 	if (uio->uio_resid == 0) {
   3148 		return (0);
   3149 	}
   3150 
   3151 	mutex_enter(&ip->i_tlock);
   3152 	ip->i_flag |= INOACC;
   3153 
   3154 	if (ioflag & (FSYNC | FDSYNC)) {
   3155 		ip->i_flag |= ISYNC;
   3156 		iupdat_flag = 1;
   3157 	}
   3158 	mutex_exit(&ip->i_tlock);
   3159 
   3160 	udf_vfsp = ip->i_udf;
   3161 	vp = ITOV(ip);
   3162 
   3163 	do {
   3164 		u_offset_t uoff = uio->uio_loffset;
   3165 		off = uoff & (offset_t)MAXBMASK;
   3166 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
   3167 		on = (int)blkoff(udf_vfsp, uoff);
   3168 		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
   3169 
   3170 		if (ip->i_type == VREG && uoff + n >= limit) {
   3171 			if (uoff >= limit) {
   3172 				error = EFBIG;
   3173 				goto out;
   3174 			}
   3175 			n = (int)(limit - (rlim64_t)uoff);
   3176 		}
   3177 		if (uoff + n > ip->i_size) {
   3178 			/*
   3179 			 * We are extending the length of the file.
   3180 			 * bmap is used so that we are sure that
   3181 			 * if we need to allocate new blocks, that it
   3182 			 * is done here before we up the file size.
   3183 			 */
   3184 			error = ud_bmap_write(ip, uoff,
   3185 			    (int)(on + n), mapon == 0, cr);
   3186 			if (error) {
   3187 				break;
   3188 			}
   3189 			i_size_changed = 1;
   3190 			old_i_size = ip->i_size;
   3191 			ip->i_size = uoff + n;
   3192 			/*
   3193 			 * If we are writing from the beginning of
   3194 			 * the mapping, we can just create the
   3195 			 * pages without having to read them.
   3196 			 */
   3197 			pagecreate = (mapon == 0);
   3198 		} else if (n == MAXBSIZE) {
   3199 			/*
   3200 			 * Going to do a whole mappings worth,
   3201 			 * so we can just create the pages w/o
   3202 			 * having to read them in.  But before
   3203 			 * we do that, we need to make sure any
   3204 			 * needed blocks are allocated first.
   3205 			 */
   3206 			error = ud_bmap_write(ip, uoff,
   3207 			    (int)(on + n), 1, cr);
   3208 			if (error) {
   3209 				break;
   3210 			}
   3211 			pagecreate = 1;
   3212 		} else {
   3213 			pagecreate = 0;
   3214 		}
   3215 
   3216 		rw_exit(&ip->i_contents);
   3217 
   3218 		/*
   3219 		 * Touch the page and fault it in if it is not in
   3220 		 * core before segmap_getmapflt can lock it. This
   3221 		 * is to avoid the deadlock if the buffer is mapped
   3222 		 * to the same file through mmap which we want to
   3223 		 * write to.
   3224 		 */
   3225 		uio_prefaultpages((long)n, uio);
   3226 
   3227 		base = segmap_getmapflt(segkmap, vp, (off + mapon),
   3228 		    (uint32_t)n, !pagecreate, S_WRITE);
   3229 
   3230 		/*
   3231 		 * segmap_pagecreate() returns 1 if it calls
   3232 		 * page_create_va() to allocate any pages.
   3233 		 */
   3234 		newpage = 0;
   3235 		if (pagecreate) {
   3236 			newpage = segmap_pagecreate(segkmap, base,
   3237 			    (size_t)n, 0);
   3238 		}
   3239 
   3240 		premove_resid = uio->uio_resid;
   3241 		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
   3242 
   3243 		if (pagecreate &&
   3244 		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
   3245 			/*
   3246 			 * We created pages w/o initializing them completely,
   3247 			 * thus we need to zero the part that wasn't set up.
   3248 			 * This happens on most EOF write cases and if
   3249 			 * we had some sort of error during the uiomove.
   3250 			 */
   3251 			int nzero, nmoved;
   3252 
   3253 			nmoved = (int)(uio->uio_loffset - (off + mapon));
   3254 			ASSERT(nmoved >= 0 && nmoved <= n);
   3255 			nzero = roundup(on + n, PAGESIZE) - nmoved;
   3256 			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
   3257 			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
   3258 		}
   3259 
   3260 		/*
   3261 		 * Unlock the pages allocated by page_create_va()
   3262 		 * in segmap_pagecreate()
   3263 		 */
   3264 		if (newpage) {
   3265 			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
   3266 		}
   3267 
   3268 		if (error) {
   3269 			/*
   3270 			 * If we failed on a write, we may have already
   3271 			 * allocated file blocks as well as pages.  It's
   3272 			 * hard to undo the block allocation, but we must
   3273 			 * be sure to invalidate any pages that may have
   3274 			 * been allocated.
   3275 			 */
   3276 			(void) segmap_release(segkmap, base, SM_INVAL);
   3277 		} else {
   3278 			flags = 0;
   3279 			/*
   3280 			 * Force write back for synchronous write cases.
   3281 			 */
   3282 			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
   3283 				/*
   3284 				 * If the sticky bit is set but the
   3285 				 * execute bit is not set, we do a
   3286 				 * synchronous write back and free
   3287 				 * the page when done.  We set up swap
   3288 				 * files to be handled this way to
   3289 				 * prevent servers from keeping around
   3290 				 * the client's swap pages too long.
   3291 				 * XXX - there ought to be a better way.
   3292 				 */
   3293 				if (IS_SWAPVP(vp)) {
   3294 					flags = SM_WRITE | SM_FREE |
   3295 					    SM_DONTNEED;
   3296 					iupdat_flag = 0;
   3297 				} else {
   3298 					flags = SM_WRITE;
   3299 				}
   3300 			} else if (((mapon + n) == MAXBSIZE) ||
   3301 			    IS_SWAPVP(vp)) {
   3302 				/*
   3303 				 * Have written a whole block.
   3304 				 * Start an asynchronous write and
   3305 				 * mark the buffer to indicate that
   3306 				 * it won't be needed again soon.
   3307 				 */
   3308 				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
   3309 			}
   3310 			error = segmap_release(segkmap, base, flags);
   3311 
   3312 			/*
   3313 			 * If the operation failed and is synchronous,
   3314 			 * then we need to unwind what uiomove() last
   3315 			 * did so we can potentially return an error to
   3316 			 * the caller.  If this write operation was
   3317 			 * done in two pieces and the first succeeded,
   3318 			 * then we won't return an error for the second
   3319 			 * piece that failed.  However, we only want to
   3320 			 * return a resid value that reflects what was
   3321 			 * really done.
   3322 			 *
   3323 			 * Failures for non-synchronous operations can
   3324 			 * be ignored since the page subsystem will
   3325 			 * retry the operation until it succeeds or the
   3326 			 * file system is unmounted.
   3327 			 */
   3328 			if (error) {
   3329 				if ((ioflag & (FSYNC | FDSYNC)) ||
   3330 				    ip->i_type == VDIR) {
   3331 					uio->uio_resid = premove_resid;
   3332 				} else {
   3333 					error = 0;
   3334 				}
   3335 			}
   3336 		}
   3337 
   3338 		/*
   3339 		 * Re-acquire contents lock.
   3340 		 */
   3341 		rw_enter(&ip->i_contents, RW_WRITER);
   3342 		/*
   3343 		 * If the uiomove() failed or if a synchronous
   3344 		 * page push failed, fix up i_size.
   3345 		 */
   3346 		if (error) {
   3347 			if (i_size_changed) {
   3348 				/*
   3349 				 * The uiomove failed, and we
   3350 				 * allocated blocks,so get rid
   3351 				 * of them.
   3352 				 */
   3353 				(void) ud_itrunc(ip, old_i_size, 0, cr);
   3354 			}
   3355 		} else {
   3356 			/*
   3357 			 * XXX - Can this be out of the loop?
   3358 			 */
   3359 			ip->i_flag |= IUPD | ICHG;
   3360 			if (i_size_changed) {
   3361 				ip->i_flag |= IATTCHG;
   3362 			}
   3363 			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
   3364 			    (IEXEC >> 10))) != 0 &&
   3365 			    (ip->i_char & (ISUID | ISGID)) != 0 &&
   3366 			    secpolicy_vnode_setid_retain(cr,
   3367 			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
   3368 				/*
   3369 				 * Clear Set-UID & Set-GID bits on
   3370 				 * successful write if not privileged
   3371 				 * and at least one of the execute bits
   3372 				 * is set.  If we always clear Set-GID,
   3373 				 * mandatory file and record locking is
   3374 				 * unuseable.
   3375 				 */
   3376 				ip->i_char &= ~(ISUID | ISGID);
   3377 			}
   3378 		}
   3379 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
   3380 
   3381 out:
   3382 	/*
   3383 	 * Inode is updated according to this table -
   3384 	 *
   3385 	 *	FSYNC	FDSYNC(posix.4)
   3386 	 *	--------------------------
   3387 	 *	always@	IATTCHG|IBDWRITE
   3388 	 *
   3389 	 * @ -  If we are doing synchronous write the only time we should
   3390 	 *	not be sync'ing the ip here is if we have the stickyhack
   3391 	 *	activated, the file is marked with the sticky bit and
   3392 	 *	no exec bit, the file length has not been changed and
   3393 	 *	no new blocks have been allocated during this write.
   3394 	 */
   3395 	if ((ip->i_flag & ISYNC) != 0) {
   3396 		/*
   3397 		 * we have eliminated nosync
   3398 		 */
   3399 		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
   3400 		    ((ioflag & FSYNC) && iupdat_flag)) {
   3401 			ud_iupdat(ip, 1);
   3402 		}
   3403 	}
   3404 
   3405 	/*
   3406 	 * If we've already done a partial-write, terminate
   3407 	 * the write but return no error.
   3408 	 */
   3409 	if (start_resid != uio->uio_resid) {
   3410 		error = 0;
   3411 	}
   3412 	ip->i_flag &= ~(INOACC | ISYNC);
   3413 	ITIMES_NOLOCK(ip);
   3414 
   3415 	return (error);
   3416 }
   3417 
   3418 int32_t
   3419 ud_multi_strat(struct ud_inode *ip,
   3420 	page_t *pp, struct buf *bp, u_offset_t start)
   3421 {
   3422 	daddr_t bn;
   3423 	int32_t error = 0, io_count, contig, alloc_sz, i;
   3424 	uint32_t io_off;
   3425 	mio_master_t *mm = NULL;
   3426 	mio_slave_t *ms = NULL;
   3427 	struct buf *rbp;
   3428 
   3429 	ASSERT(!(start & PAGEOFFSET));
   3430 
   3431 	/*
   3432 	 * Figure out how many buffers to allocate
   3433 	 */
   3434 	io_count = 0;
   3435 	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
   3436 		contig = 0;
   3437 		if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
   3438 		    &bn, &contig)) {
   3439 			goto end;
   3440 		}
   3441 		if (contig == 0) {
   3442 			goto end;
   3443 		}
   3444 		contig = MIN(contig, PAGESIZE - io_off);
   3445 		if (bn != UDF_HOLE) {
   3446 			io_count ++;
   3447 		} else {
   3448 			/*
   3449 			 * HOLE
   3450 			 */
   3451 			if (bp->b_flags & B_READ) {
   3452 
   3453 				/*
   3454 				 * This is a hole and is read
   3455 				 * it should be filled with 0's
   3456 				 */
   3457 				pagezero(pp, io_off, contig);
   3458 			}
   3459 		}
   3460 	}
   3461 
   3462 
   3463 	if (io_count != 0) {
   3464 
   3465 		/*
   3466 		 * Allocate memory for all the
   3467 		 * required number of buffers
   3468 		 */
   3469 		alloc_sz = sizeof (mio_master_t) +
   3470 		    (sizeof (mio_slave_t) * io_count);
   3471 		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
   3472 		if (mm == NULL) {
   3473 			error = ENOMEM;
   3474 			goto end;
   3475 		}
   3476 
   3477 		/*
   3478 		 * initialize master
   3479 		 */
   3480 		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
   3481 		mm->mm_size = alloc_sz;
   3482 		mm->mm_bp = bp;
   3483 		mm->mm_resid = 0;
   3484 		mm->mm_error = 0;
   3485 		mm->mm_index = master_index++;
   3486 
   3487 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
   3488 
   3489 		/*
   3490 		 * Initialize buffers
   3491 		 */
   3492 		io_count = 0;
   3493 		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
   3494 			contig = 0;
   3495 			if (error = ud_bmap_read(ip,
   3496 			    (u_offset_t)(start + io_off),
   3497 			    &bn, &contig)) {
   3498 				goto end;
   3499 			}
   3500 			ASSERT(contig);
   3501 			if ((io_off + contig) > bp->b_bcount) {
   3502 				contig = bp->b_bcount - io_off;
   3503 			}
   3504 			if (bn != UDF_HOLE) {
   3505 				/*
   3506 				 * Clone the buffer
   3507 				 * and prepare to start I/O
   3508 				 */
   3509 				ms->ms_ptr = mm;
   3510 				bioinit(&ms->ms_buf);
   3511 				rbp = bioclone(bp, io_off, (size_t)contig,
   3512 				    bp->b_edev, bn, ud_slave_done,
   3513 				    &ms->ms_buf, KM_NOSLEEP);
   3514 				ASSERT(rbp == &ms->ms_buf);
   3515 				mm->mm_resid += contig;
   3516 				io_count++;
   3517 				ms ++;
   3518 			}
   3519 		}
   3520 
   3521 		/*
   3522 		 * Start I/O's
   3523 		 */
   3524 		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
   3525 		for (i = 0; i < io_count; i++) {
   3526 			(void) bdev_strategy(&ms->ms_buf);
   3527 			ms ++;
   3528 		}
   3529 	}
   3530 
   3531 end:
   3532 	if (error != 0) {
   3533 		bp->b_flags |= B_ERROR;
   3534 		bp->b_error = error;
   3535 		if (mm != NULL) {
   3536 			mutex_destroy(&mm->mm_mutex);
   3537 			kmem_free(mm, mm->mm_size);
   3538 		}
   3539 	}
   3540 	return (error);
   3541 }
   3542 
   3543 int32_t
   3544 ud_slave_done(struct buf *bp)
   3545 {
   3546 	mio_master_t *mm;
   3547 	int32_t resid;
   3548 
   3549 	ASSERT(SEMA_HELD(&bp->b_sem));
   3550 	ASSERT((bp->b_flags & B_DONE) == 0);
   3551 
   3552 	mm = ((mio_slave_t *)bp)->ms_ptr;
   3553 
   3554 	/*
   3555 	 * Propagate error and byte count info from slave struct to
   3556 	 * the master struct
   3557 	 */
   3558 	mutex_enter(&mm->mm_mutex);
   3559 	if (bp->b_flags & B_ERROR) {
   3560 
   3561 		/*
   3562 		 * If multiple slave buffers get
   3563 		 * error we forget the old errors
   3564 		 * this is ok because we any way
   3565 		 * cannot return multiple errors
   3566 		 */
   3567 		mm->mm_error = bp->b_error;
   3568 	}
   3569 	mm->mm_resid -= bp->b_bcount;
   3570 	resid = mm->mm_resid;
   3571 	mutex_exit(&mm->mm_mutex);
   3572 
   3573 	/*
   3574 	 * free up the resources allocated to cloned buffers.
   3575 	 */
   3576 	bp_mapout(bp);
   3577 	biofini(bp);
   3578 
   3579 	if (resid == 0) {
   3580 
   3581 		/*
   3582 		 * This is the last I/O operation
   3583 		 * clean up and return the original buffer
   3584 		 */
   3585 		if (mm->mm_error) {
   3586 			mm->mm_bp->b_flags |= B_ERROR;
   3587 			mm->mm_bp->b_error = mm->mm_error;
   3588 		}
   3589 		biodone(mm->mm_bp);
   3590 		mutex_destroy(&mm->mm_mutex);
   3591 		kmem_free(mm, mm->mm_size);
   3592 	}
   3593 	return (0);
   3594 }
   3595