Home | History | Annotate | Download | only in dcfs
      1 
      2 /*
      3  * CDDL HEADER START
      4  *
      5  * The contents of this file are subject to the terms of the
      6  * Common Development and Distribution License (the "License").
      7  * You may not use this file except in compliance with the License.
      8  *
      9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     10  * or http://www.opensolaris.org/os/licensing.
     11  * See the License for the specific language governing permissions
     12  * and limitations under the License.
     13  *
     14  * When distributing Covered Code, include this CDDL HEADER in each
     15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     16  * If applicable, add the following below this CDDL HEADER, with the
     17  * fields enclosed by brackets "[]" replaced with your own identifying
     18  * information: Portions Copyright [yyyy] [name of copyright owner]
     19  *
     20  * CDDL HEADER END
     21  */
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved  	*/
     29 
     30 /*
     31  * University Copyright- Copyright (c) 1982, 1986, 1988
     32  * The Regents of the University of California
     33  * All Rights Reserved
     34  *
     35  * University Acknowledgment- Portions of this document are derived from
     36  * software developed by the University of California, Berkeley, and its
     37  * contributors.
     38  */
     39 
     40 #include <sys/types.h>
     41 #include <sys/thread.h>
     42 #include <sys/t_lock.h>
     43 #include <sys/param.h>
     44 #include <sys/systm.h>
     45 #include <sys/bitmap.h>
     46 #include <sys/buf.h>
     47 #include <sys/cmn_err.h>
     48 #include <sys/conf.h>
     49 #include <sys/ddi.h>
     50 #include <sys/debug.h>
     51 #include <sys/errno.h>
     52 #include <sys/time.h>
     53 #include <sys/fcntl.h>
     54 #include <sys/flock.h>
     55 #include <sys/file.h>
     56 #include <sys/kmem.h>
     57 #include <sys/mman.h>
     58 #include <sys/vmsystm.h>
     59 #include <sys/open.h>
     60 #include <sys/swap.h>
     61 #include <sys/sysmacros.h>
     62 #include <sys/uio.h>
     63 #include <sys/vfs.h>
     64 #include <sys/vfs_opreg.h>
     65 #include <sys/vnode.h>
     66 #include <sys/stat.h>
     67 #include <sys/poll.h>
     68 #include <sys/zmod.h>
     69 #include <sys/fs/decomp.h>
     70 
     71 #include <vm/hat.h>
     72 #include <vm/as.h>
     73 #include <vm/page.h>
     74 #include <vm/pvn.h>
     75 #include <vm/seg_vn.h>
     76 #include <vm/seg_kmem.h>
     77 #include <vm/seg_map.h>
     78 
     79 #include <fs/fs_subr.h>
     80 
     81 /*
     82  * dcfs - A filesystem for automatic decompressing of fiocompressed files
     83  *
     84  * This filesystem is a layered filesystem that sits on top of a normal
     85  * persistent filesystem and provides automatic decompression of files
     86  * that have been previously compressed and stored on the host file system.
     87  * This is a pseudo filesystem in that it does not persist data, rather it
     88  * intercepts file lookup requests on the host filesystem and provides
     89  * transparent decompression of those files. Currently the only supported
     90  * host filesystem is ufs.
     91  *
     92  * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
     93  * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via a ufs ioctl() - see ufs_vnops.c`ufs_ioctl()`_FIO_COMPRESSED).
 * ufs_lookup checks for this flag and, if set, passes control to decompvp(),
 * a function defined in this (dcfs) filesystem. decompvp validates the
 * compression header and returns a dcfs vnode to the VFS layer; the file
 * data itself is uncompressed on demand, block by block, in dc_getpage().
     98  *
     99  * dcfs is layered on top of ufs and passes requests involving persistence
    100  * to the underlying ufs filesystem. The compressed files currently cannot be
    101  * written to.
    102  */
    103 
    104 
    105 /*
    106  * Define data structures within this file.
    107  */
    108 #define	DCSHFT		5
    109 #define	DCTABLESIZE	16
    110 
    111 #if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
    112 #define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
    113 #else
    114 #define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DTABLESIZEC)
    115 #endif
    116 
    117 #define	DCLRUSIZE	16
    118 
    119 #define	DCCACHESIZE	4
    120 
    121 #define	rounddown(x, y)	((x) & ~((y) - 1))
    122 
/*
 * Hash table of dcnodes, indexed by DCHASH() of the subordinate vnode
 * and chained through dc_hash.
 */
struct dcnode	*dctable[DCTABLESIZE];

/*
 * Head of the circular, doubly linked lru list (dc_lrunext/dc_lruprev)
 * and the number of dcnodes currently on it.
 */
struct dcnode	*dclru;
static int	dclru_len;

/* Held while the hash table or the lru list is examined or modified */
kmutex_t	dctable_lock;

/* Device number and vfs used for all dcfs vnodes; set up in dcinit() */
dev_t		dcdev;
struct vfs	dc_vfs;

/*
 * dcnode_cache supplies dcnodes; dcbuf_cache[] holds the buffer caches
 * used for compressed data, created on demand in decompvp() and
 * selected by block size in pages.
 */
struct kmem_cache *dcnode_cache;
struct kmem_cache *dcbuf_cache[DCCACHESIZE];

/* Held while a dcbuf_cache[] slot is checked and populated */
kmutex_t	dccache_lock;
    137 
    138 static int dcinit(int, char *);
    139 
    140 static struct dcnode	*dcnode_alloc(void);
    141 static void		dcnode_free(struct dcnode *);
    142 static void		dcnode_recycle(struct dcnode *);
    143 
    144 static void		dcinsert(struct dcnode *);
    145 static void		dcdelete(struct dcnode *);
    146 static struct dcnode	*dcfind(struct vnode *);
    147 static void		dclru_add(struct dcnode *);
    148 static void		dclru_sub(struct dcnode *);
    149 
    150 
    151 /*
    152  * This is the loadable module wrapper.
    153  */
    154 #include <sys/modctl.h>
    155 
/* vfs operations vector, filled in by vfs_setfsops() in dcinit() */
struct vfsops *dc_vfsops;

/*
 * Filesystem definition handed to the module framework: the name "dcfs"
 * and dcinit() as the initialization routine.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	0,
	NULL
};
    165 
    166 /*
    167  * Module linkage information for the kernel.
    168  */
extern struct mod_ops mod_fsops;

/* Filesystem module description; refers to the vfw definition above */
static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

/* Linkage structure passed to mod_install() and mod_info() */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};
    178 
    179 int
    180 _init()
    181 {
    182 	return (mod_install(&modlinkage));
    183 }
    184 
    185 int
    186 _info(struct modinfo *modinfop)
    187 {
    188 	return (mod_info(&modlinkage, modinfop));
    189 }
    190 
    191 
    192 static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
    193 static int dc_close(struct vnode *, int, int, offset_t,
    194     struct cred *, caller_context_t *);
    195 static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    196     struct caller_context *);
    197 static int dc_getattr(struct vnode *, struct vattr *, int,
    198     struct cred *, caller_context_t *);
    199 static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    200     struct caller_context *);
    201 static int dc_access(struct vnode *, int, int,
    202     struct cred *, caller_context_t *);
    203 static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
    204 static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
    205 static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
    206 static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
    207 static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    208     struct flk_callback *, struct cred *, caller_context_t *);
    209 static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    210     struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    211     struct cred *, caller_context_t *);
    212 static int dc_putpage(struct vnode *, offset_t, size_t, int,
    213     struct cred *, caller_context_t *);
    214 static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    215     uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
    216 static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    217     uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
    218 static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    219     uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
    220 
/* vnode operations vector, built from the template by vn_make_ops() */
struct vnodeops *dc_vnodeops;

/*
 * Template listing the vnode operations dcfs implements itself.  No
 * write operation is listed; compressed files are read-only.
 */
const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,			{ .vop_open = dc_open },
	VOPNAME_CLOSE,			{ .vop_close = dc_close },
	VOPNAME_READ,			{ .vop_read = dc_read },
	VOPNAME_GETATTR,		{ .vop_getattr =  dc_getattr },
	VOPNAME_SETATTR,		{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,			{ .vop_access = dc_access },
	VOPNAME_FSYNC,			{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,		{ .vop_inactive = dc_inactive },
	VOPNAME_FID,			{ .vop_fid = dc_fid },
	VOPNAME_SEEK,			{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,			{ .vop_frlock = dc_frlock },
	VOPNAME_GETPAGE,		{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,		{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,			{ .vop_map = dc_map },
	VOPNAME_ADDMAP,			{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,			{ .vop_delmap = dc_delmap },
	NULL,				NULL
};
    242 
    243 /*ARGSUSED*/
    244 static int
    245 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
    246 {
    247 	return (0);
    248 }
    249 
    250 /*ARGSUSED*/
    251 static int
    252 dc_close(struct vnode *vp, int flag, int count, offset_t off,
    253     struct cred *cr, caller_context_t *ctp)
    254 {
    255 	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
    256 	cleanshares(vp, ttoproc(curthread)->p_pid);
    257 	return (0);
    258 }
    259 
    260 /*ARGSUSED*/
    261 static int
    262 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    263 	struct caller_context *ct)
    264 {
    265 	struct dcnode *dp = VTODC(vp);
    266 	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
    267 	size_t fsize = dp->dc_hdr->ch_fsize;
    268 	int error;
    269 
    270 	/*
    271 	 * Loop through file with segmap, decompression will occur
    272 	 * in dc_getapage
    273 	 */
    274 	do {
    275 		caddr_t base;
    276 		size_t n;
    277 		offset_t mapon;
    278 
    279 		/*
    280 		 * read to end of block or file
    281 		 */
    282 		mapon = uiop->uio_loffset & (rdsize - 1);
    283 		n = MIN(rdsize - mapon, uiop->uio_resid);
    284 		n = MIN(n, fsize - uiop->uio_loffset);
    285 		if (n == 0)
    286 			return (0);	/* at EOF */
    287 
    288 		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
    289 		    S_READ);
    290 		error = uiomove(base + mapon, n, UIO_READ, uiop);
    291 		if (!error) {
    292 			uint_t flags;
    293 
    294 			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
    295 				flags = SM_DONTNEED;
    296 			else
    297 				flags = 0;
    298 			error = segmap_release(segkmap, base, flags);
    299 		} else
    300 			(void) segmap_release(segkmap, base, 0);
    301 	} while (!error && uiop->uio_resid);
    302 
    303 	return (error);
    304 }
    305 
    306 static int
    307 dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    308     cred_t *cred, caller_context_t *ctp)
    309 {
    310 	struct dcnode *dp = VTODC(vp);
    311 	struct vnode *subvp = dp->dc_subvp;
    312 	int error;
    313 
    314 	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);
    315 
    316 	/* substitute uncompressed size */
    317 	vap->va_size = dp->dc_hdr->ch_fsize;
    318 	return (error);
    319 }
    320 
    321 static int
    322 dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    323     caller_context_t *ctp)
    324 {
    325 	struct dcnode *dp = VTODC(vp);
    326 	struct vnode *subvp = dp->dc_subvp;
    327 
    328 	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
    329 }
    330 
    331 static int
    332 dc_access(struct vnode *vp, int mode, int flags,
    333     cred_t *cred, caller_context_t *ctp)
    334 {
    335 	struct dcnode *dp = VTODC(vp);
    336 	struct vnode *subvp = dp->dc_subvp;
    337 
    338 	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
    339 }
    340 
    341 /*ARGSUSED*/
    342 static int
    343 dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
    344 {
    345 	return (0);
    346 }
    347 
    348 /*ARGSUSED*/
    349 static void
    350 dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
    351 {
    352 	struct dcnode *dp = VTODC(vp);
    353 
    354 	mutex_enter(&dctable_lock);
    355 	mutex_enter(&vp->v_lock);
    356 	ASSERT(vp->v_count >= 1);
    357 	if (--vp->v_count != 0) {
    358 		/*
    359 		 * Somebody accessed the dcnode before we got a chance to
    360 		 * remove it.  They will remove it when they do a vn_rele.
    361 		 */
    362 		mutex_exit(&vp->v_lock);
    363 		mutex_exit(&dctable_lock);
    364 		return;
    365 	}
    366 	mutex_exit(&vp->v_lock);
    367 
    368 	dcnode_free(dp);
    369 
    370 	mutex_exit(&dctable_lock);
    371 }
    372 
    373 static int
    374 dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
    375 {
    376 	struct dcnode *dp = VTODC(vp);
    377 	struct vnode *subvp = dp->dc_subvp;
    378 
    379 	return (VOP_FID(subvp, fidp, ctp));
    380 }
    381 
    382 static int
    383 dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
    384 {
    385 	struct dcnode *dp = VTODC(vp);
    386 	struct vnode *subvp = dp->dc_subvp;
    387 
    388 	return (VOP_SEEK(subvp, oof, noffp, ctp));
    389 }
    390 
    391 static int
    392 dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    393     offset_t offset, struct flk_callback *flk_cbp,
    394     cred_t *cr, caller_context_t *ctp)
    395 {
    396 	struct dcnode *dp = VTODC(vp);
    397 
    398 	/*
    399 	 * If file is being mapped, disallow frlock.
    400 	 */
    401 	if (dp->dc_mapcnt > 0)
    402 		return (EAGAIN);
    403 
    404 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
    405 }
    406 
    407 /*ARGSUSED*/
    408 static int
    409 dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    410     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
    411 {
    412 	struct dcnode *dp = VTODC(vp);
    413 	struct comphdr *hdr = dp->dc_hdr;
    414 	struct page *pp;
    415 	struct buf *bp;
    416 	caddr_t saddr;
    417 	off_t cblkno;
    418 	size_t rdoff, rdsize, dsize;
    419 	long xlen;
    420 	int error, zerr;
    421 
    422 	ASSERT(len == hdr->ch_blksize);
    423 	/*
    424 	 * Get destination pages and make them addressable
    425 	 */
    426 	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
    427 	bp = pageio_setup(pp, len, vp, B_READ);
    428 	bp_mapin(bp);
    429 
    430 	/*
    431 	 * read compressed data from subordinate vnode
    432 	 */
    433 	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
    434 	cblkno = off / len;
    435 	rdoff = hdr->ch_blkmap[cblkno];
    436 	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
    437 	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
    438 	    UIO_SYSSPACE, 0, 0, cr, NULL);
    439 	if (error)
    440 		goto cleanup;
    441 
    442 	/*
    443 	 * Uncompress
    444 	 */
    445 	dsize = len;
    446 	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
    447 	if (zerr != Z_OK) {
    448 		error = EIO;
    449 		goto cleanup;
    450 	}
    451 
    452 	/*
    453 	 * Handle EOF
    454 	 */
    455 	xlen = hdr->ch_fsize - off;
    456 	if (xlen < len) {
    457 		bzero(bp->b_un.b_addr + xlen, len - xlen);
    458 		if (dsize != xlen)
    459 			error = EIO;
    460 	} else if (dsize != len)
    461 		error = EIO;
    462 
    463 	/*
    464 	 * Clean up
    465 	 */
    466 cleanup:
    467 	kmem_cache_free(dp->dc_bufcache, saddr);
    468 	pageio_done(bp);
    469 	*ppp = pp;
    470 	return (error);
    471 }
    472 
    473 static int
    474 dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    475     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
    476 {
    477 	struct page *pp, *plist = NULL;
    478 	offset_t pgoff;
    479 	int rdblk;
    480 
    481 	/*
    482 	 * pvn_read_kluster() doesn't quite do what we want, since it
    483 	 * thinks sub block reads are ok.  Here we always decompress
    484 	 * a full block.
    485 	 */
    486 
    487 	/*
    488 	 * Check page cache
    489 	 */
    490 	rdblk = 0;
    491 	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
    492 		pp = page_lookup(vp, pgoff, SE_EXCL);
    493 		if (pp == NULL) {
    494 			rdblk = 1;
    495 			break;
    496 		}
    497 		page_io_lock(pp);
    498 		page_add(&plist, pp);
    499 		plist = plist->p_next;
    500 	}
    501 	if (!rdblk) {
    502 		*ppp = plist;
    503 		return (0);	/* all pages in cache */
    504 	}
    505 
    506 	/*
    507 	 * Undo any locks so getblock_miss has an open field
    508 	 */
    509 	if (plist != NULL)
    510 		pvn_io_done(plist);
    511 
    512 	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
    513 }
    514 
    515 /*ARGSUSED10*/
    516 static int
    517 dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    518     struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    519     enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
    520 {
    521 	struct dcnode *dp = VTODC(vp);
    522 	struct comphdr *hdr = dp->dc_hdr;
    523 	struct page *pp, *plist = NULL;
    524 	caddr_t vp_baddr;
    525 	offset_t vp_boff, vp_bend;
    526 	size_t bsize = hdr->ch_blksize;
    527 	int nblks, error;
    528 
    529 	/* does not support write */
    530 	if (rw == S_WRITE) {
    531 		panic("write attempt on compressed file");
    532 		/*NOTREACHED*/
    533 	}
    534 
    535 	if (protp)
    536 		*protp = PROT_ALL;
    537 	/*
    538 	 * We don't support asynchronous operation at the moment, so
    539 	 * just pretend we did it.  If the pages are ever actually
    540 	 * needed, they'll get brought in then.
    541 	 */
    542 	if (pl == NULL)
    543 		return (0);
    544 
    545 	/*
    546 	 * Calc block start and end offsets
    547 	 */
    548 	vp_boff = rounddown(off, bsize);
    549 	vp_bend = roundup(off + len, bsize);
    550 	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);
    551 
    552 	nblks = (vp_bend - vp_boff) / bsize;
    553 	while (nblks--) {
    554 		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
    555 		    rw, cr);
    556 		page_list_concat(&plist, &pp);
    557 		vp_boff += bsize;
    558 		vp_baddr += bsize;
    559 	}
    560 	if (!error)
    561 		pvn_plist_init(plist, pl, plsz, off, len, rw);
    562 	else
    563 		pvn_read_done(plist, B_ERROR);
    564 	return (error);
    565 }
    566 
    567 /*
    568  * This function should never be called. We need to have it to pass
    569  * it as an argument to other functions.
    570  */
    571 /*ARGSUSED*/
    572 static int
    573 dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    574     int flags, struct cred *cr)
    575 {
    576 	/* should never happen */
    577 	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
    578 	/*NOTREACHED*/
    579 	return (0);
    580 }
    581 
    582 
    583 /*
    584  * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
    585  * B_INVAL is set by:
    586  *
    587  *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
    588  *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
    589  *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
    590  *
    591  * The B_FREE (as well as the B_DONTNEED) flag is set when the
    592  * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
    593  * from SEGVN to release pages behind a pagefault.
    594  */
    595 /*ARGSUSED5*/
    596 static int
    597 dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    598     struct cred *cr, caller_context_t *ctp)
    599 {
    600 	int error = 0;
    601 
    602 	if (vp->v_count == 0) {
    603 		panic("dcfs_putpage: bad v_count");
    604 		/*NOTREACHED*/
    605 	}
    606 
    607 	if (vp->v_flag & VNOMAP)
    608 		return (ENOSYS);
    609 
    610 	if (!vn_has_cached_data(vp))	/* no pages mapped */
    611 		return (0);
    612 
    613 	if (len == 0)		/* from 'off' to EOF */
    614 		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
    615 	else {
    616 		offset_t io_off;
    617 		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
    618 
    619 		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
    620 			page_t *pp;
    621 
    622 			/*
    623 			 * We insist on getting the page only if we are
    624 			 * about to invalidate, free or write it and
    625 			 * the B_ASYNC flag is not set.
    626 			 */
    627 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
    628 				pp = page_lookup(vp, io_off, se);
    629 			else
    630 				pp = page_lookup_nowait(vp, io_off, se);
    631 
    632 			if (pp == NULL)
    633 				continue;
    634 			/*
    635 			 * Normally pvn_getdirty() should return 0, which
    636 			 * impies that it has done the job for us.
    637 			 * The shouldn't-happen scenario is when it returns 1.
    638 			 * This means that the page has been modified and
    639 			 * needs to be put back.
    640 			 * Since we can't write to a dcfs compressed file,
    641 			 * we fake a failed I/O and force pvn_write_done()
    642 			 * to destroy the page.
    643 			 */
    644 			if (pvn_getdirty(pp, flags) == 1) {
    645 				cmn_err(CE_NOTE, "dc_putpage: dirty page");
    646 				pvn_write_done(pp, flags |
    647 				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
    648 			}
    649 		}
    650 	}
    651 	return (error);
    652 }
    653 
    654 static int
    655 dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    656     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    657     struct cred *cred, caller_context_t *ctp)
    658 {
    659 	struct vattr vattr;
    660 	struct segvn_crargs vn_a;
    661 	int error;
    662 
    663 	if (vp->v_flag & VNOMAP)
    664 		return (ENOSYS);
    665 
    666 	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
    667 		return (ENXIO);
    668 
    669 	/*
    670 	 * If file is being locked, disallow mapping.
    671 	 */
    672 	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
    673 		return (error);
    674 	if (vn_has_mandatory_locks(vp, vattr.va_mode))
    675 		return (EAGAIN);
    676 
    677 	as_rangelock(as);
    678 
    679 	if ((flags & MAP_FIXED) == 0) {
    680 		map_addr(addrp, len, off, 1, flags);
    681 		if (*addrp == NULL) {
    682 			as_rangeunlock(as);
    683 			return (ENOMEM);
    684 		}
    685 	} else {
    686 		/*
    687 		 * User specified address - blow away any previous mappings
    688 		 */
    689 		(void) as_unmap(as, *addrp, len);
    690 	}
    691 
    692 	vn_a.vp = vp;
    693 	vn_a.offset = off;
    694 	vn_a.type = flags & MAP_TYPE;
    695 	vn_a.prot = prot;
    696 	vn_a.maxprot = maxprot;
    697 	vn_a.flags = flags & ~MAP_TYPE;
    698 	vn_a.cred = cred;
    699 	vn_a.amp = NULL;
    700 	vn_a.szc = 0;
    701 	vn_a.lgrp_mem_policy_flags = 0;
    702 
    703 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
    704 	as_rangeunlock(as);
    705 	return (error);
    706 }
    707 
    708 /*ARGSUSED*/
    709 static int
    710 dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    711     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    712     struct cred *cr, caller_context_t *ctp)
    713 {
    714 	struct dcnode *dp;
    715 
    716 	if (vp->v_flag & VNOMAP)
    717 		return (ENOSYS);
    718 
    719 	dp = VTODC(vp);
    720 	mutex_enter(&dp->dc_lock);
    721 	dp->dc_mapcnt += btopr(len);
    722 	mutex_exit(&dp->dc_lock);
    723 	return (0);
    724 }
    725 
    726 /*ARGSUSED*/
    727 static int
    728 dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    729     size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    730     struct cred *cr, caller_context_t *ctp)
    731 {
    732 	struct dcnode *dp;
    733 
    734 	if (vp->v_flag & VNOMAP)
    735 		return (ENOSYS);
    736 
    737 	dp = VTODC(vp);
    738 	mutex_enter(&dp->dc_lock);
    739 	dp->dc_mapcnt -= btopr(len);
    740 	ASSERT(dp->dc_mapcnt >= 0);
    741 	mutex_exit(&dp->dc_lock);
    742 	return (0);
    743 }
    744 
    745 /*
    746  * Constructor/destructor routines for dcnodes
    747  */
    748 /*ARGSUSED1*/
    749 static int
    750 dcnode_constructor(void *buf, void *cdrarg, int kmflags)
    751 {
    752 	struct dcnode *dp = buf;
    753 	struct vnode *vp;
    754 
    755 	vp = dp->dc_vp = vn_alloc(kmflags);
    756 	if (vp == NULL) {
    757 		return (-1);
    758 	}
    759 	vp->v_data = dp;
    760 	vp->v_type = VREG;
    761 	vp->v_flag = VNOSWAP;
    762 	vp->v_vfsp = &dc_vfs;
    763 	vn_setops(vp, dc_vnodeops);
    764 	vn_exists(vp);
    765 
    766 	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
    767 	dp->dc_mapcnt = 0;
    768 	dp->dc_lrunext = dp->dc_lruprev = NULL;
    769 	dp->dc_hdr = NULL;
    770 	dp->dc_subvp = NULL;
    771 	return (0);
    772 }
    773 
    774 /*ARGSUSED*/
    775 static void
    776 dcnode_destructor(void *buf, void *cdrarg)
    777 {
    778 	struct dcnode *dp = buf;
    779 	struct vnode *vp = DCTOV(dp);
    780 
    781 	mutex_destroy(&dp->dc_lock);
    782 
    783 	VERIFY(dp->dc_hdr == NULL);
    784 	VERIFY(dp->dc_subvp == NULL);
    785 	vn_invalid(vp);
    786 	vn_free(vp);
    787 }
    788 
    789 static struct dcnode *
    790 dcnode_alloc(void)
    791 {
    792 	struct dcnode *dp;
    793 
    794 	/*
    795 	 * If the free list is above DCLRUSIZE
    796 	 * re-use one from it
    797 	 */
    798 	mutex_enter(&dctable_lock);
    799 	if (dclru_len < DCLRUSIZE) {
    800 		mutex_exit(&dctable_lock);
    801 		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
    802 	} else {
    803 		ASSERT(dclru != NULL);
    804 		dp = dclru;
    805 		dclru_sub(dp);
    806 		dcdelete(dp);
    807 		mutex_exit(&dctable_lock);
    808 		dcnode_recycle(dp);
    809 	}
    810 	return (dp);
    811 }
    812 
    813 static void
    814 dcnode_free(struct dcnode *dp)
    815 {
    816 	struct vnode *vp = DCTOV(dp);
    817 
    818 	ASSERT(MUTEX_HELD(&dctable_lock));
    819 
    820 	/*
    821 	 * If no cached pages, no need to put it on lru
    822 	 */
    823 	if (!vn_has_cached_data(vp)) {
    824 		dcdelete(dp);
    825 		dcnode_recycle(dp);
    826 		kmem_cache_free(dcnode_cache, dp);
    827 		return;
    828 	}
    829 
    830 	/*
    831 	 * Add to lru, if it's over the limit, free from head
    832 	 */
    833 	dclru_add(dp);
    834 	if (dclru_len > DCLRUSIZE) {
    835 		dp = dclru;
    836 		dclru_sub(dp);
    837 		dcdelete(dp);
    838 		dcnode_recycle(dp);
    839 		kmem_cache_free(dcnode_cache, dp);
    840 	}
    841 }
    842 
    843 static void
    844 dcnode_recycle(struct dcnode *dp)
    845 {
    846 	struct vnode *vp;
    847 
    848 	vp = DCTOV(dp);
    849 
    850 	VN_RELE(dp->dc_subvp);
    851 	dp->dc_subvp = NULL;
    852 	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
    853 	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
    854 	dp->dc_hdr = NULL;
    855 	dp->dc_hdrsize = dp->dc_zmax = 0;
    856 	dp->dc_bufcache = NULL;
    857 	dp->dc_mapcnt = 0;
    858 	vn_reinit(vp);
    859 	vp->v_type = VREG;
    860 	vp->v_flag = VNOSWAP;
    861 	vp->v_vfsp = &dc_vfs;
    862 }
    863 
    864 static int
    865 dcinit(int fstype, char *name)
    866 {
    867 	static const fs_operation_def_t dc_vfsops_template[] = {
    868 		NULL, NULL
    869 	};
    870 	int error;
    871 	major_t dev;
    872 
    873 	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
    874 	if (error) {
    875 		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
    876 		return (error);
    877 	}
    878 	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
    879 	dc_vfs.vfs_flag = VFS_RDONLY;
    880 	dc_vfs.vfs_fstype = fstype;
    881 	if ((dev = getudev()) == (major_t)-1)
    882 		dev = 0;
    883 	dcdev = makedevice(dev, 0);
    884 	dc_vfs.vfs_dev = dcdev;
    885 
    886 	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
    887 	if (error != 0) {
    888 		(void) vfs_freevfsops_by_type(fstype);
    889 		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
    890 		return (error);
    891 	}
    892 
    893 	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
    894 	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
    895 	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
    896 	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);
    897 
    898 	return (0);
    899 }
    900 
    901 /*
    902  * Return shadow vnode with the given vp as its subordinate
    903  */
    904 struct vnode *
    905 decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
    906 {
    907 	struct dcnode *dp, *ndp;
    908 	struct comphdr thdr, *hdr;
    909 	struct kmem_cache **cpp;
    910 	struct vattr vattr;
    911 	size_t hdrsize, bsize;
    912 	int error;
    913 
    914 	/*
    915 	 * See if we have an existing shadow
    916 	 * If none, we have to manufacture one
    917 	 */
    918 	mutex_enter(&dctable_lock);
    919 	dp = dcfind(vp);
    920 	mutex_exit(&dctable_lock);
    921 	if (dp != NULL)
    922 		return (DCTOV(dp));
    923 
    924 	/*
    925 	 * Make sure it's a valid compressed file
    926 	 */
    927 	hdr = &thdr;
    928 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
    929 	    UIO_SYSSPACE, 0, 0, cred, NULL);
    930 	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
    931 	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
    932 	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
    933 	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
    934 	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
    935 		return (NULL);
    936 
    937 	/* get underlying file size */
    938 	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
    939 		return (NULL);
    940 
    941 	/*
    942 	 * Re-read entire header
    943 	 */
    944 	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
    945 	hdr = kmem_alloc(hdrsize, KM_SLEEP);
    946 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
    947 	    0, 0, cred, NULL);
    948 	if (error) {
    949 		kmem_free(hdr, hdrsize);
    950 		return (NULL);
    951 	}
    952 
    953 	/*
    954 	 * add extra blkmap entry to make dc_getblock()'s
    955 	 * life easier
    956 	 */
    957 	bsize = hdr->ch_blksize;
    958 	hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size;
    959 
    960 	ndp = dcnode_alloc();
    961 	ndp->dc_subvp = vp;
    962 	VN_HOLD(vp);
    963 	ndp->dc_hdr = hdr;
    964 	ndp->dc_hdrsize = hdrsize;
    965 
    966 	/*
    967 	 * Allocate kmem cache if none there already
    968 	 */
    969 	ndp->dc_zmax = ZMAXBUF(bsize);
    970 	cpp = &dcbuf_cache[btop(bsize)];
    971 	mutex_enter(&dccache_lock);
    972 	if (*cpp == NULL)
    973 		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
    974 		    NULL, NULL, NULL, NULL, 0);
    975 	mutex_exit(&dccache_lock);
    976 	ndp->dc_bufcache = *cpp;
    977 
    978 	/*
    979 	 * Recheck table in case someone else created shadow
    980 	 * while we were blocked above.
    981 	 */
    982 	mutex_enter(&dctable_lock);
    983 	dp = dcfind(vp);
    984 	if (dp != NULL) {
    985 		mutex_exit(&dctable_lock);
    986 		dcnode_recycle(ndp);
    987 		kmem_cache_free(dcnode_cache, ndp);
    988 		return (DCTOV(dp));
    989 	}
    990 	dcinsert(ndp);
    991 	mutex_exit(&dctable_lock);
    992 
    993 	return (DCTOV(ndp));
    994 }
    995 
    996 
    997 /*
    998  * dcnode lookup table
    999  * These routines maintain a table of dcnodes hashed by their
   1000  * subordinate vnode so that they can be found if they already
   1001  * exist in the vnode cache
   1002  */
   1003 
   1004 /*
   1005  * Put a dcnode in the table.
   1006  */
   1007 static void
   1008 dcinsert(struct dcnode *newdp)
   1009 {
   1010 	int idx = DCHASH(newdp->dc_subvp);
   1011 
   1012 	ASSERT(MUTEX_HELD(&dctable_lock));
   1013 	newdp->dc_hash = dctable[idx];
   1014 	dctable[idx] = newdp;
   1015 }
   1016 
   1017 /*
   1018  * Remove a dcnode from the hash table.
   1019  */
   1020 void
   1021 dcdelete(struct dcnode *deldp)
   1022 {
   1023 	int idx = DCHASH(deldp->dc_subvp);
   1024 	struct dcnode *dp, *prevdp;
   1025 
   1026 	ASSERT(MUTEX_HELD(&dctable_lock));
   1027 	dp = dctable[idx];
   1028 	if (dp == deldp)
   1029 		dctable[idx] = dp->dc_hash;
   1030 	else {
   1031 		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
   1032 		    prevdp = dp, dp = dp->dc_hash) {
   1033 			if (dp == deldp) {
   1034 				prevdp->dc_hash = dp->dc_hash;
   1035 				break;
   1036 			}
   1037 		}
   1038 	}
   1039 	ASSERT(dp != NULL);
   1040 }
   1041 
   1042 /*
   1043  * Find a shadow vnode in the dctable hash list.
   1044  */
   1045 static struct dcnode *
   1046 dcfind(struct vnode *vp)
   1047 {
   1048 	struct dcnode *dp;
   1049 
   1050 	ASSERT(MUTEX_HELD(&dctable_lock));
   1051 	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
   1052 		if (dp->dc_subvp == vp) {
   1053 			VN_HOLD(DCTOV(dp));
   1054 			if (dp->dc_lrunext)
   1055 				dclru_sub(dp);
   1056 			return (dp);
   1057 		}
   1058 	return (NULL);
   1059 }
   1060 
   1061 #ifdef	DEBUG
   1062 static int
   1063 dclru_count(void)
   1064 {
   1065 	struct dcnode *dp;
   1066 	int i = 0;
   1067 
   1068 	if (dclru == NULL)
   1069 		return (0);
   1070 	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
   1071 		i++;
   1072 	return (i + 1);
   1073 }
   1074 #endif
   1075 
   1076 static void
   1077 dclru_add(struct dcnode *dp)
   1078 {
   1079 	/*
   1080 	 * Add to dclru as double-link chain
   1081 	 */
   1082 	ASSERT(MUTEX_HELD(&dctable_lock));
   1083 	if (dclru == NULL) {
   1084 		dclru = dp;
   1085 		dp->dc_lruprev = dp->dc_lrunext = dp;
   1086 	} else {
   1087 		struct dcnode *last = dclru->dc_lruprev;
   1088 
   1089 		dclru->dc_lruprev = dp;
   1090 		last->dc_lrunext = dp;
   1091 		dp->dc_lruprev = last;
   1092 		dp->dc_lrunext = dclru;
   1093 	}
   1094 	dclru_len++;
   1095 	ASSERT(dclru_len == dclru_count());
   1096 }
   1097 
   1098 static void
   1099 dclru_sub(struct dcnode *dp)
   1100 {
   1101 	ASSERT(MUTEX_HELD(&dctable_lock));
   1102 	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
   1103 	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
   1104 	if (dp == dclru)
   1105 		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
   1106 	dp->dc_lrunext = dp->dc_lruprev = NULL;
   1107 	dclru_len--;
   1108 	ASSERT(dclru_len == dclru_count());
   1109 }
   1110