1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/time.h> 33 #include <sys/systm.h> 34 #include <sys/sysmacros.h> 35 #include <sys/resource.h> 36 #include <sys/vfs.h> 37 #include <sys/vfs_opreg.h> 38 #include <sys/vnode.h> 39 #include <sys/file.h> 40 #include <sys/stat.h> 41 #include <sys/kmem.h> 42 #include <sys/taskq.h> 43 #include <sys/uio.h> 44 #include <sys/vmsystm.h> 45 #include <sys/atomic.h> 46 #include <sys/vm.h> 47 #include <vm/seg_vn.h> 48 #include <vm/pvn.h> 49 #include <vm/as.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/cmn_err.h> 53 #include <sys/errno.h> 54 #include <sys/unistd.h> 55 #include <sys/zfs_dir.h> 56 #include <sys/zfs_acl.h> 57 #include <sys/zfs_ioctl.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/dmu.h> 60 #include <sys/spa.h> 61 #include <sys/txg.h> 62 #include <sys/dbuf.h> 63 #include <sys/zap.h> 64 #include <sys/dirent.h> 65 #include <sys/policy.h> 66 #include <sys/sunddi.h> 67 #include <sys/filio.h> 68 #include "fs/fs_subr.h" 69 #include <sys/zfs_ctldir.h> 70 #include <sys/zfs_fuid.h> 71 #include <sys/dnlc.h> 72 #include <sys/zfs_rlock.h> 73 #include <sys/extdirent.h> 74 #include <sys/kidmap.h> 75 #include <sys/cred_impl.h> 76 #include <sys/attr.h> 77 78 /* 79 * Programming rules. 80 * 81 * Each vnode op performs some logical unit of work. To do this, the ZPL must 82 * properly lock its in-core state, create a DMU transaction, do the work, 83 * record this work in the intent log (ZIL), commit the DMU transaction, 84 * and wait for the intent log to commit if it is a synchronous operation. 85 * Moreover, the vnode ops must work in both normal and log replay context. 86 * The ordering of events is important to avoid deadlocks and references 87 * to freed memory. The example below illustrates the following Big Rules: 88 * 89 * (1) A check must be made in each zfs thread for a mounted file system. 90 * This is done avoiding races using ZFS_ENTER(zfsvfs). 91 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 92 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 93 * can return EIO from the calling function. 94 * 95 * (2) VN_RELE() should always be the last thing except for zil_commit() 96 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 97 * First, if it's the last reference, the vnode/znode 98 * can be freed, so the zp may point to freed memory. Second, the last 99 * reference will call zfs_zinactive(), which may induce a lot of work -- 100 * pushing cached pages (which acquires range locks) and syncing out 101 * cached atime changes. Third, zfs_zinactive() may require a new tx, 102 * which could deadlock the system if you were already holding one. 103 * 104 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 105 * as they can span dmu_tx_assign() calls. 106 * 107 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 108 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 109 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 110 * This is critical because we don't want to block while holding locks. 111 * Note, in particular, that if a lock is sometimes acquired before 112 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 113 * use a non-blocking assign can deadlock the system. The scenario: 114 * 115 * Thread A has grabbed a lock before calling dmu_tx_assign(). 116 * Thread B is in an already-assigned tx, and blocks for this lock. 117 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 118 * forever, because the previous txg can't quiesce until B's tx commits. 119 * 120 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 121 * then drop all locks, call dmu_tx_wait(), and try again. 122 * 123 * (5) If the operation succeeded, generate the intent log entry for it 124 * before dropping locks. This ensures that the ordering of events 125 * in the intent log matches the order in which they actually occurred. 126 * 127 * (6) At the end of each vnode op, the DMU tx must always commit, 128 * regardless of whether there were any errors. 129 * 130 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 131 * to ensure that synchronous semantics are provided when necessary. 132 * 133 * In general, this is how things should be ordered in each vnode op: 134 * 135 * ZFS_ENTER(zfsvfs); // exit if unmounted 136 * top: 137 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) 138 * rw_enter(...); // grab any other locks you need 139 * tx = dmu_tx_create(...); // get DMU tx 140 * dmu_tx_hold_*(); // hold each object you might modify 141 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign 142 * if (error) { 143 * rw_exit(...); // drop locks 144 * zfs_dirent_unlock(dl); // unlock directory entry 145 * VN_RELE(...); // release held vnodes 146 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 147 * dmu_tx_wait(tx); 148 * dmu_tx_abort(tx); 149 * goto top; 150 * } 151 * dmu_tx_abort(tx); // abort DMU tx 152 * ZFS_EXIT(zfsvfs); // finished in zfs 153 * return (error); // really out of space 154 * } 155 * error = do_real_work(); // do whatever this VOP does 156 * if (error == 0) 157 * zfs_log_*(...); // on success, make ZIL entry 158 * dmu_tx_commit(tx); // commit DMU tx -- error or not 159 * rw_exit(...); // drop locks 160 * zfs_dirent_unlock(dl); // unlock directory entry 161 * VN_RELE(...); // release held vnodes 162 * zil_commit(zilog, seq, foid); // synchronous when necessary 163 * ZFS_EXIT(zfsvfs); // finished in zfs 164 * return (error); // done, report error 165 */ 166 167 /* ARGSUSED */ 168 static int 169 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 170 { 171 znode_t *zp = VTOZ(*vpp); 172 173 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 174 ((flag & FAPPEND) == 0)) { 175 return (EPERM); 176 } 177 178 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 179 ZTOV(zp)->v_type == VREG && 180 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 181 zp->z_phys->zp_size > 0) 182 if (fs_vscan(*vpp, cr, 0) != 0) 183 return (EACCES); 184 185 /* Keep a count of the synchronous opens in the znode */ 186 if (flag & (FSYNC | FDSYNC)) 187 atomic_inc_32(&zp->z_sync_cnt); 188 189 return (0); 190 } 191 192 /* ARGSUSED */ 193 static int 194 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 195 caller_context_t *ct) 196 { 197 znode_t *zp = VTOZ(vp); 198 199 /* Decrement the synchronous opens in the znode */ 200 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 201 atomic_dec_32(&zp->z_sync_cnt); 202 203 /* 204 * Clean up any locks held by this process on the vp. 205 */ 206 cleanlocks(vp, ddi_get_pid(), 0); 207 cleanshares(vp, ddi_get_pid()); 208 209 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 210 ZTOV(zp)->v_type == VREG && 211 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 212 zp->z_phys->zp_size > 0) 213 VERIFY(fs_vscan(vp, cr, 1) == 0); 214 215 return (0); 216 } 217 218 /* 219 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 220 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 221 */ 222 static int 223 zfs_holey(vnode_t *vp, int cmd, offset_t *off) 224 { 225 znode_t *zp = VTOZ(vp); 226 uint64_t noff = (uint64_t)*off; /* new offset */ 227 uint64_t file_sz; 228 int error; 229 boolean_t hole; 230 231 file_sz = zp->z_phys->zp_size; 232 if (noff >= file_sz) { 233 return (ENXIO); 234 } 235 236 if (cmd == _FIO_SEEK_HOLE) 237 hole = B_TRUE; 238 else 239 hole = B_FALSE; 240 241 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 242 243 /* end of file? */ 244 if ((error == ESRCH) || (noff > file_sz)) { 245 /* 246 * Handle the virtual hole at the end of file. 247 */ 248 if (hole) { 249 *off = file_sz; 250 return (0); 251 } 252 return (ENXIO); 253 } 254 255 if (noff < *off) 256 return (error); 257 *off = noff; 258 return (error); 259 } 260 261 /* ARGSUSED */ 262 static int 263 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, 264 int *rvalp, caller_context_t *ct) 265 { 266 offset_t off; 267 int error; 268 zfsvfs_t *zfsvfs; 269 znode_t *zp; 270 271 switch (com) { 272 case _FIOFFS: 273 return (zfs_sync(vp->v_vfsp, 0, cred)); 274 275 /* 276 * The following two ioctls are used by bfu. Faking out, 277 * necessary to avoid bfu errors. 278 */ 279 case _FIOGDIO: 280 case _FIOSDIO: 281 return (0); 282 283 case _FIO_SEEK_DATA: 284 case _FIO_SEEK_HOLE: 285 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 286 return (EFAULT); 287 288 zp = VTOZ(vp); 289 zfsvfs = zp->z_zfsvfs; 290 ZFS_ENTER(zfsvfs); 291 ZFS_VERIFY_ZP(zp); 292 293 /* offset parameter is in/out */ 294 error = zfs_holey(vp, com, &off); 295 ZFS_EXIT(zfsvfs); 296 if (error) 297 return (error); 298 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 299 return (EFAULT); 300 return (0); 301 } 302 return (ENOTTY); 303 } 304 305 /* 306 * When a file is memory mapped, we must keep the IO data synchronized 307 * between the DMU cache and the memory mapped pages. What this means: 308 * 309 * On Write: If we find a memory mapped page, we write to *both* 310 * the page and the dmu buffer. 311 * 312 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 313 * the file is memory mapped. 314 */ 315 static int 316 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) 317 { 318 znode_t *zp = VTOZ(vp); 319 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 320 int64_t start, off; 321 int len = nbytes; 322 int error = 0; 323 324 start = uio->uio_loffset; 325 off = start & PAGEOFFSET; 326 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 327 page_t *pp; 328 uint64_t bytes = MIN(PAGESIZE - off, len); 329 uint64_t woff = uio->uio_loffset; 330 331 /* 332 * We don't want a new page to "appear" in the middle of 333 * the file update (because it may not get the write 334 * update data), so we grab a lock to block 335 * zfs_getpage(). 336 */ 337 rw_enter(&zp->z_map_lock, RW_WRITER); 338 if (pp = page_lookup(vp, start, SE_SHARED)) { 339 caddr_t va; 340 341 rw_exit(&zp->z_map_lock); 342 va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L); 343 error = uiomove(va+off, bytes, UIO_WRITE, uio); 344 if (error == 0) { 345 dmu_write(zfsvfs->z_os, zp->z_id, 346 woff, bytes, va+off, tx); 347 } 348 ppmapout(va); 349 page_unlock(pp); 350 } else { 351 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 352 uio, bytes, tx); 353 rw_exit(&zp->z_map_lock); 354 } 355 len -= bytes; 356 off = 0; 357 if (error) 358 break; 359 } 360 return (error); 361 } 362 363 /* 364 * When a file is memory mapped, we must keep the IO data synchronized 365 * between the DMU cache and the memory mapped pages. What this means: 366 * 367 * On Read: We "read" preferentially from memory mapped pages, 368 * else we default from the dmu buffer. 369 * 370 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 371 * the file is memory mapped. 372 */ 373 static int 374 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 375 { 376 znode_t *zp = VTOZ(vp); 377 objset_t *os = zp->z_zfsvfs->z_os; 378 int64_t start, off; 379 int len = nbytes; 380 int error = 0; 381 382 start = uio->uio_loffset; 383 off = start & PAGEOFFSET; 384 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 385 page_t *pp; 386 uint64_t bytes = MIN(PAGESIZE - off, len); 387 388 if (pp = page_lookup(vp, start, SE_SHARED)) { 389 caddr_t va; 390 391 va = ppmapin(pp, PROT_READ, (caddr_t)-1L); 392 error = uiomove(va + off, bytes, UIO_READ, uio); 393 ppmapout(va); 394 page_unlock(pp); 395 } else { 396 error = dmu_read_uio(os, zp->z_id, uio, bytes); 397 } 398 len -= bytes; 399 off = 0; 400 if (error) 401 break; 402 } 403 return (error); 404 } 405 406 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 407 408 /* 409 * Read bytes from specified file into supplied buffer. 410 * 411 * IN: vp - vnode of file to be read from. 412 * uio - structure supplying read location, range info, 413 * and return buffer. 414 * ioflag - SYNC flags; used to provide FRSYNC semantics. 415 * cr - credentials of caller. 416 * ct - caller context 417 * 418 * OUT: uio - updated offset and range, buffer filled. 419 * 420 * RETURN: 0 if success 421 * error code if failure 422 * 423 * Side Effects: 424 * vp - atime updated if byte count > 0 425 */ 426 /* ARGSUSED */ 427 static int 428 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 429 { 430 znode_t *zp = VTOZ(vp); 431 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 432 objset_t *os; 433 ssize_t n, nbytes; 434 int error; 435 rl_t *rl; 436 437 ZFS_ENTER(zfsvfs); 438 ZFS_VERIFY_ZP(zp); 439 os = zfsvfs->z_os; 440 441 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { 442 ZFS_EXIT(zfsvfs); 443 return (EACCES); 444 } 445 446 /* 447 * Validate file offset 448 */ 449 if (uio->uio_loffset < (offset_t)0) { 450 ZFS_EXIT(zfsvfs); 451 return (EINVAL); 452 } 453 454 /* 455 * Fasttrack empty reads 456 */ 457 if (uio->uio_resid == 0) { 458 ZFS_EXIT(zfsvfs); 459 return (0); 460 } 461 462 /* 463 * Check for mandatory locks 464 */ 465 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 466 if (error = chklock(vp, FREAD, 467 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 468 ZFS_EXIT(zfsvfs); 469 return (error); 470 } 471 } 472 473 /* 474 * If we're in FRSYNC mode, sync out this znode before reading it. 475 */ 476 if (ioflag & FRSYNC) 477 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 478 479 /* 480 * Lock the range against changes. 481 */ 482 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 483 484 /* 485 * If we are reading past end-of-file we can skip 486 * to the end; but we might still need to set atime. 487 */ 488 if (uio->uio_loffset >= zp->z_phys->zp_size) { 489 error = 0; 490 goto out; 491 } 492 493 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 494 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 495 496 while (n > 0) { 497 nbytes = MIN(n, zfs_read_chunk_size - 498 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 499 500 if (vn_has_cached_data(vp)) 501 error = mappedread(vp, nbytes, uio); 502 else 503 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 504 if (error) 505 break; 506 507 n -= nbytes; 508 } 509 510 out: 511 zfs_range_unlock(rl); 512 513 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 514 ZFS_EXIT(zfsvfs); 515 return (error); 516 } 517 518 /* 519 * Fault in the pages of the first n bytes specified by the uio structure. 520 * 1 byte in each page is touched and the uio struct is unmodified. 521 * Any error will exit this routine as this is only a best 522 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 523 */ 524 static void 525 zfs_prefault_write(ssize_t n, struct uio *uio) 526 { 527 struct iovec *iov; 528 ulong_t cnt, incr; 529 caddr_t p; 530 uint8_t tmp; 531 532 iov = uio->uio_iov; 533 534 while (n) { 535 cnt = MIN(iov->iov_len, n); 536 if (cnt == 0) { 537 /* empty iov entry */ 538 iov++; 539 continue; 540 } 541 n -= cnt; 542 /* 543 * touch each page in this segment. 544 */ 545 p = iov->iov_base; 546 while (cnt) { 547 switch (uio->uio_segflg) { 548 case UIO_USERSPACE: 549 case UIO_USERISPACE: 550 if (fuword8(p, &tmp)) 551 return; 552 break; 553 case UIO_SYSSPACE: 554 if (kcopy(p, &tmp, 1)) 555 return; 556 break; 557 } 558 incr = MIN(cnt, PAGESIZE); 559 p += incr; 560 cnt -= incr; 561 } 562 /* 563 * touch the last byte in case it straddles a page. 564 */ 565 p--; 566 switch (uio->uio_segflg) { 567 case UIO_USERSPACE: 568 case UIO_USERISPACE: 569 if (fuword8(p, &tmp)) 570 return; 571 break; 572 case UIO_SYSSPACE: 573 if (kcopy(p, &tmp, 1)) 574 return; 575 break; 576 } 577 iov++; 578 } 579 } 580 581 /* 582 * Write the bytes to a file. 583 * 584 * IN: vp - vnode of file to be written to. 585 * uio - structure supplying write location, range info, 586 * and data buffer. 587 * ioflag - FAPPEND flag set if in append mode. 588 * cr - credentials of caller. 589 * ct - caller context (NFS/CIFS fem monitor only) 590 * 591 * OUT: uio - updated offset and range. 592 * 593 * RETURN: 0 if success 594 * error code if failure 595 * 596 * Timestamps: 597 * vp - ctime|mtime updated if byte count > 0 598 */ 599 /* ARGSUSED */ 600 static int 601 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 602 { 603 znode_t *zp = VTOZ(vp); 604 rlim64_t limit = uio->uio_llimit; 605 ssize_t start_resid = uio->uio_resid; 606 ssize_t tx_bytes; 607 uint64_t end_size; 608 dmu_tx_t *tx; 609 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 610 zilog_t *zilog; 611 offset_t woff; 612 ssize_t n, nbytes; 613 rl_t *rl; 614 int max_blksz = zfsvfs->z_max_blksz; 615 uint64_t pflags; 616 int error; 617 618 /* 619 * Fasttrack empty write 620 */ 621 n = start_resid; 622 if (n == 0) 623 return (0); 624 625 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 626 limit = MAXOFFSET_T; 627 628 ZFS_ENTER(zfsvfs); 629 ZFS_VERIFY_ZP(zp); 630 631 /* 632 * If immutable or not appending then return EPERM 633 */ 634 pflags = zp->z_phys->zp_flags; 635 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 636 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 637 (uio->uio_loffset < zp->z_phys->zp_size))) { 638 ZFS_EXIT(zfsvfs); 639 return (EPERM); 640 } 641 642 zilog = zfsvfs->z_log; 643 644 /* 645 * Pre-fault the pages to ensure slow (eg NFS) pages 646 * don't hold up txg. 647 */ 648 zfs_prefault_write(n, uio); 649 650 /* 651 * If in append mode, set the io offset pointer to eof. 652 */ 653 if (ioflag & FAPPEND) { 654 /* 655 * Range lock for a file append: 656 * The value for the start of range will be determined by 657 * zfs_range_lock() (to guarantee append semantics). 658 * If this write will cause the block size to increase, 659 * zfs_range_lock() will lock the entire file, so we must 660 * later reduce the range after we grow the block size. 661 */ 662 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 663 if (rl->r_len == UINT64_MAX) { 664 /* overlocked, zp_size can't change */ 665 woff = uio->uio_loffset = zp->z_phys->zp_size; 666 } else { 667 woff = uio->uio_loffset = rl->r_off; 668 } 669 } else { 670 woff = uio->uio_loffset; 671 /* 672 * Validate file offset 673 */ 674 if (woff < 0) { 675 ZFS_EXIT(zfsvfs); 676 return (EINVAL); 677 } 678 679 /* 680 * If we need to grow the block size then zfs_range_lock() 681 * will lock a wider range than we request here. 682 * Later after growing the block size we reduce the range. 683 */ 684 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 685 } 686 687 if (woff >= limit) { 688 zfs_range_unlock(rl); 689 ZFS_EXIT(zfsvfs); 690 return (EFBIG); 691 } 692 693 if ((woff + n) > limit || woff > (limit - n)) 694 n = limit - woff; 695 696 /* 697 * Check for mandatory locks 698 */ 699 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 700 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 701 zfs_range_unlock(rl); 702 ZFS_EXIT(zfsvfs); 703 return (error); 704 } 705 end_size = MAX(zp->z_phys->zp_size, woff + n); 706 707 /* 708 * Write the file in reasonable size chunks. Each chunk is written 709 * in a separate transaction; this keeps the intent log records small 710 * and allows us to do more fine-grained space accounting. 711 */ 712 while (n > 0) { 713 /* 714 * Start a transaction. 715 */ 716 woff = uio->uio_loffset; 717 tx = dmu_tx_create(zfsvfs->z_os); 718 dmu_tx_hold_bonus(tx, zp->z_id); 719 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 720 error = dmu_tx_assign(tx, zfsvfs->z_assign); 721 if (error) { 722 if (error == ERESTART && 723 zfsvfs->z_assign == TXG_NOWAIT) { 724 dmu_tx_wait(tx); 725 dmu_tx_abort(tx); 726 continue; 727 } 728 dmu_tx_abort(tx); 729 break; 730 } 731 732 /* 733 * If zfs_range_lock() over-locked we grow the blocksize 734 * and then reduce the lock range. This will only happen 735 * on the first iteration since zfs_range_reduce() will 736 * shrink down r_len to the appropriate size. 737 */ 738 if (rl->r_len == UINT64_MAX) { 739 uint64_t new_blksz; 740 741 if (zp->z_blksz > max_blksz) { 742 ASSERT(!ISP2(zp->z_blksz)); 743 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 744 } else { 745 new_blksz = MIN(end_size, max_blksz); 746 } 747 zfs_grow_blocksize(zp, new_blksz, tx); 748 zfs_range_reduce(rl, woff, n); 749 } 750 751 /* 752 * XXX - should we really limit each write to z_max_blksz? 753 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 754 */ 755 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 756 rw_enter(&zp->z_map_lock, RW_READER); 757 758 tx_bytes = uio->uio_resid; 759 if (vn_has_cached_data(vp)) { 760 rw_exit(&zp->z_map_lock); 761 error = mappedwrite(vp, nbytes, uio, tx); 762 } else { 763 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 764 uio, nbytes, tx); 765 rw_exit(&zp->z_map_lock); 766 } 767 tx_bytes -= uio->uio_resid; 768 769 /* 770 * If we made no progress, we're done. If we made even 771 * partial progress, update the znode and ZIL accordingly. 772 */ 773 if (tx_bytes == 0) { 774 dmu_tx_commit(tx); 775 ASSERT(error != 0); 776 break; 777 } 778 779 /* 780 * Clear Set-UID/Set-GID bits on successful write if not 781 * privileged and at least one of the excute bits is set. 782 * 783 * It would be nice to to this after all writes have 784 * been done, but that would still expose the ISUID/ISGID 785 * to another app after the partial write is committed. 786 * 787 * Note: we don't call zfs_fuid_map_id() here because 788 * user 0 is not an ephemeral uid. 789 */ 790 mutex_enter(&zp->z_acl_lock); 791 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 792 (S_IXUSR >> 6))) != 0 && 793 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 794 secpolicy_vnode_setid_retain(cr, 795 (zp->z_phys->zp_mode & S_ISUID) != 0 && 796 zp->z_phys->zp_uid == 0) != 0) { 797 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 798 } 799 mutex_exit(&zp->z_acl_lock); 800 801 /* 802 * Update time stamp. NOTE: This marks the bonus buffer as 803 * dirty, so we don't have to do it again for zp_size. 804 */ 805 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 806 807 /* 808 * Update the file size (zp_size) if it has changed; 809 * account for possible concurrent updates. 810 */ 811 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 812 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 813 uio->uio_loffset); 814 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 815 dmu_tx_commit(tx); 816 817 if (error != 0) 818 break; 819 ASSERT(tx_bytes == nbytes); 820 n -= nbytes; 821 } 822 823 zfs_range_unlock(rl); 824 825 /* 826 * If we're in replay mode, or we made no progress, return error. 827 * Otherwise, it's at least a partial write, so it's successful. 828 */ 829 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 830 ZFS_EXIT(zfsvfs); 831 return (error); 832 } 833 834 if (ioflag & (FSYNC | FDSYNC)) 835 zil_commit(zilog, zp->z_last_itx, zp->z_id); 836 837 ZFS_EXIT(zfsvfs); 838 return (0); 839 } 840 841 void 842 zfs_get_done(dmu_buf_t *db, void *vzgd) 843 { 844 zgd_t *zgd = (zgd_t *)vzgd; 845 rl_t *rl = zgd->zgd_rl; 846 vnode_t *vp = ZTOV(rl->r_zp); 847 848 dmu_buf_rele(db, vzgd); 849 zfs_range_unlock(rl); 850 VN_RELE(vp); 851 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 852 kmem_free(zgd, sizeof (zgd_t)); 853 } 854 855 /* 856 * Get data to generate a TX_WRITE intent log record. 857 */ 858 int 859 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 860 { 861 zfsvfs_t *zfsvfs = arg; 862 objset_t *os = zfsvfs->z_os; 863 znode_t *zp; 864 uint64_t off = lr->lr_offset; 865 dmu_buf_t *db; 866 rl_t *rl; 867 zgd_t *zgd; 868 int dlen = lr->lr_length; /* length of user data */ 869 int error = 0; 870 871 ASSERT(zio); 872 ASSERT(dlen != 0); 873 874 /* 875 * Nothing to do if the file has been removed 876 */ 877 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 878 return (ENOENT); 879 if (zp->z_unlinked) { 880 VN_RELE(ZTOV(zp)); 881 return (ENOENT); 882 } 883 884 /* 885 * Write records come in two flavors: immediate and indirect. 886 * For small writes it's cheaper to store the data with the 887 * log record (immediate); for large writes it's cheaper to 888 * sync the data and get a pointer to it (indirect) so that 889 * we don't have to write the data twice. 890 */ 891 if (buf != NULL) { /* immediate write */ 892 rl = zfs_range_lock(zp, off, dlen, RL_READER); 893 /* test for truncation needs to be done while range locked */ 894 if (off >= zp->z_phys->zp_size) { 895 error = ENOENT; 896 goto out; 897 } 898 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 899 } else { /* indirect write */ 900 uint64_t boff; /* block starting offset */ 901 902 /* 903 * Have to lock the whole block to ensure when it's 904 * written out and it's checksum is being calculated 905 * that no one can change the data. We need to re-check 906 * blocksize after we get the lock in case it's changed! 907 */ 908 for (;;) { 909 if (ISP2(zp->z_blksz)) { 910 boff = P2ALIGN_TYPED(off, zp->z_blksz, 911 uint64_t); 912 } else { 913 boff = 0; 914 } 915 dlen = zp->z_blksz; 916 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 917 if (zp->z_blksz == dlen) 918 break; 919 zfs_range_unlock(rl); 920 } 921 /* test for truncation needs to be done while range locked */ 922 if (off >= zp->z_phys->zp_size) { 923 error = ENOENT; 924 goto out; 925 } 926 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 927 zgd->zgd_rl = rl; 928 zgd->zgd_zilog = zfsvfs->z_log; 929 zgd->zgd_bp = &lr->lr_blkptr; 930 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 931 ASSERT(boff == db->db_offset); 932 lr->lr_blkoff = off - boff; 933 error = dmu_sync(zio, db, &lr->lr_blkptr, 934 lr->lr_common.lrc_txg, zfs_get_done, zgd); 935 ASSERT((error && error != EINPROGRESS) || 936 lr->lr_length <= zp->z_blksz); 937 if (error == 0) 938 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 939 /* 940 * If we get EINPROGRESS, then we need to wait for a 941 * write IO initiated by dmu_sync() to complete before 942 * we can release this dbuf. We will finish everything 943 * up in the zfs_get_done() callback. 944 */ 945 if (error == EINPROGRESS) 946 return (0); 947 dmu_buf_rele(db, zgd); 948 kmem_free(zgd, sizeof (zgd_t)); 949 } 950 out: 951 zfs_range_unlock(rl); 952 VN_RELE(ZTOV(zp)); 953 return (error); 954 } 955 956 /*ARGSUSED*/ 957 static int 958 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 959 caller_context_t *ct) 960 { 961 znode_t *zp = VTOZ(vp); 962 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 963 int error; 964 965 ZFS_ENTER(zfsvfs); 966 ZFS_VERIFY_ZP(zp); 967 968 if (flag & V_ACE_MASK) 969 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 970 else 971 error = zfs_zaccess_rwx(zp, mode, flag, cr); 972 973 ZFS_EXIT(zfsvfs); 974 return (error); 975 } 976 977 /* 978 * Lookup an entry in a directory, or an extended attribute directory. 979 * If it exists, return a held vnode reference for it. 980 * 981 * IN: dvp - vnode of directory to search. 982 * nm - name of entry to lookup. 983 * pnp - full pathname to lookup [UNUSED]. 984 * flags - LOOKUP_XATTR set if looking for an attribute. 985 * rdir - root directory vnode [UNUSED]. 986 * cr - credentials of caller. 987 * ct - caller context 988 * direntflags - directory lookup flags 989 * realpnp - returned pathname. 990 * 991 * OUT: vpp - vnode of located entry, NULL if not found. 992 * 993 * RETURN: 0 if success 994 * error code if failure 995 * 996 * Timestamps: 997 * NA 998 */ 999 /* ARGSUSED */ 1000 static int 1001 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1002 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1003 int *direntflags, pathname_t *realpnp) 1004 { 1005 znode_t *zdp = VTOZ(dvp); 1006 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1007 int error; 1008 1009 ZFS_ENTER(zfsvfs); 1010 ZFS_VERIFY_ZP(zdp); 1011 1012 *vpp = NULL; 1013 1014 if (flags & LOOKUP_XATTR) { 1015 /* 1016 * If the xattr property is off, refuse the lookup request. 1017 */ 1018 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1019 ZFS_EXIT(zfsvfs); 1020 return (EINVAL); 1021 } 1022 1023 /* 1024 * We don't allow recursive attributes.. 1025 * Maybe someday we will. 1026 */ 1027 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 1028 ZFS_EXIT(zfsvfs); 1029 return (EINVAL); 1030 } 1031 1032 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1033 ZFS_EXIT(zfsvfs); 1034 return (error); 1035 } 1036 1037 /* 1038 * Do we have permission to get into attribute directory? 1039 */ 1040 1041 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1042 B_FALSE, cr)) { 1043 VN_RELE(*vpp); 1044 *vpp = NULL; 1045 } 1046 1047 ZFS_EXIT(zfsvfs); 1048 return (error); 1049 } 1050 1051 if (dvp->v_type != VDIR) { 1052 ZFS_EXIT(zfsvfs); 1053 return (ENOTDIR); 1054 } 1055 1056 /* 1057 * Check accessibility of directory. 1058 */ 1059 1060 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1061 ZFS_EXIT(zfsvfs); 1062 return (error); 1063 } 1064 1065 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1066 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1067 ZFS_EXIT(zfsvfs); 1068 return (EILSEQ); 1069 } 1070 1071 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1072 if (error == 0) { 1073 /* 1074 * Convert device special files 1075 */ 1076 if (IS_DEVVP(*vpp)) { 1077 vnode_t *svp; 1078 1079 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1080 VN_RELE(*vpp); 1081 if (svp == NULL) 1082 error = ENOSYS; 1083 else 1084 *vpp = svp; 1085 } 1086 } 1087 1088 ZFS_EXIT(zfsvfs); 1089 return (error); 1090 } 1091 1092 /* 1093 * Attempt to create a new entry in a directory. If the entry 1094 * already exists, truncate the file if permissible, else return 1095 * an error. Return the vp of the created or trunc'd file. 1096 * 1097 * IN: dvp - vnode of directory to put new file entry in. 1098 * name - name of new file entry. 1099 * vap - attributes of new file. 1100 * excl - flag indicating exclusive or non-exclusive mode. 1101 * mode - mode to open file with. 1102 * cr - credentials of caller. 1103 * flag - large file flag [UNUSED]. 1104 * ct - caller context 1105 * vsecp - ACL to be set 1106 * 1107 * OUT: vpp - vnode of created or trunc'd entry. 1108 * 1109 * RETURN: 0 if success 1110 * error code if failure 1111 * 1112 * Timestamps: 1113 * dvp - ctime|mtime updated if new entry created 1114 * vp - ctime|mtime always, atime if new 1115 */ 1116 1117 /* ARGSUSED */ 1118 static int 1119 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, 1120 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, 1121 vsecattr_t *vsecp) 1122 { 1123 znode_t *zp, *dzp = VTOZ(dvp); 1124 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1125 zilog_t *zilog; 1126 objset_t *os; 1127 zfs_dirlock_t *dl; 1128 dmu_tx_t *tx; 1129 int error; 1130 zfs_acl_t *aclp = NULL; 1131 zfs_fuid_info_t *fuidp = NULL; 1132 1133 /* 1134 * If we have an ephemeral id, ACL, or XVATTR then 1135 * make sure file system is at proper version 1136 */ 1137