Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     40 
     41 /*
     42  * Each physical swap area has an associated bitmap representing
     43  * its physical storage. The bitmap records which swap slots are
     44  * currently allocated or freed.  Allocation is done by searching
     45  * through the bitmap for the first free slot. Thus, there's
     46  * no linear relation between offset within the swap device and the
     47  * address (within its segment(s)) of the page that the slot backs;
     48  * instead, it's an arbitrary one-to-one mapping.
     49  *
     50  * Associated with each swap area is a swapinfo structure.  These
     51  * structures are linked into a linear list that determines the
     52  * ordering of swap areas in the logical swap device.  Each contains a
     53  * pointer to the corresponding bitmap, the area's size, and its
     54  * associated vnode.
     55  */
     56 
     57 #include <sys/types.h>
     58 #include <sys/inttypes.h>
     59 #include <sys/param.h>
     60 #include <sys/t_lock.h>
     61 #include <sys/sysmacros.h>
     62 #include <sys/systm.h>
     63 #include <sys/errno.h>
     64 #include <sys/kmem.h>
     65 #include <sys/vfs.h>
     66 #include <sys/vnode.h>
     67 #include <sys/pathname.h>
     68 #include <sys/cmn_err.h>
     69 #include <sys/vtrace.h>
     70 #include <sys/swap.h>
     71 #include <sys/dumphdr.h>
     72 #include <sys/debug.h>
     73 #include <sys/fs/snode.h>
     74 #include <sys/fs/swapnode.h>
     75 #include <sys/policy.h>
     76 #include <sys/zone.h>
     77 
     78 #include <vm/as.h>
     79 #include <vm/seg.h>
     80 #include <vm/page.h>
     81 #include <vm/seg_vn.h>
     82 #include <vm/hat.h>
     83 #include <vm/anon.h>
     84 #include <vm/seg_map.h>
     85 
     86 /*
     87  * To balance the load among multiple swap areas, we don't allow
     88  * more than swap_maxcontig allocations to be satisfied from a
     89  * single swap area before moving on to the next swap area.  This
     90  * effectively "interleaves" allocations among the many swap areas.
     91  */
     92 int swap_maxcontig;	/* set by anon_init() to 1 Mb */
     93 
     94 #define	MINIROOTSIZE	12000	/* ~6 Meg XXX */
     95 
     96 /*
     97  * XXX - this lock is a kludge. It serializes some aspects of swapadd() and
     98  * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE).  It protects against
     99  * somebody swapadd'ing and getting swap slots from a vnode, while someone
    100  * else is in the process of closing or rele'ing it.
    101  */
    102 static kmutex_t swap_lock;
    103 
    104 kmutex_t swapinfo_lock;
    105 
    106 /*
    107  * protected by the swapinfo_lock
    108  */
    109 struct swapinfo	*swapinfo;
    110 
    111 static	struct	swapinfo *silast;
    112 static	int	nswapfiles;
    113 
    114 static u_offset_t	swap_getoff(struct swapinfo *);
    115 static int	swapadd(struct vnode *, ulong_t, ulong_t, char *);
    116 static int	swapdel(struct vnode *, ulong_t);
    117 static int	swapslot_free(struct vnode *, u_offset_t, struct swapinfo *);
    118 
    119 /*
    120  * swap device bitmap allocation macros
    121  */
    122 #define	MAPSHIFT	5
    123 #define	NBBW		(NBPW * NBBY)	/* number of bits per word */
    124 #define	TESTBIT(map, i)		(((map)[(i) >> MAPSHIFT] & (1 << (i) % NBBW)))
    125 #define	SETBIT(map, i)		(((map)[(i) >> MAPSHIFT] |= (1 << (i) % NBBW)))
    126 #define	CLEARBIT(map, i)	(((map)[(i) >> MAPSHIFT] &= ~(1 << (i) % NBBW)))
    127 
    128 int swap_debug = 0;	/* set for debug printf's */
    129 int swap_verify = 0;	/* set to verify slots when freeing and allocating */
    130 
    131 uint_t swapalloc_maxcontig;
    132 
    133 /*
    134  * Allocate a range of up to *lenp contiguous slots (page) from a physical
    135  * swap device. Flags are one of:
    136  *	SA_NOT  Must have a slot from a physical swap device other than the
    137  * 		the one containing input (*vpp, *offp).
    138  * Less slots than requested may be returned. *lenp allocated slots are
    139  * returned starting at *offp on *vpp.
    140  * Returns 1 for a successful allocation, 0 for couldn't allocate any slots.
    141  */
    142 int
    143 swap_phys_alloc(
    144 	struct vnode **vpp,
    145 	u_offset_t *offp,
    146 	size_t *lenp,
    147 	uint_t flags)
    148 {
    149 	struct swapinfo *sip;
    150 	offset_t soff, noff;
    151 	size_t len;
    152 
    153 	mutex_enter(&swapinfo_lock);
    154 	sip = silast;
    155 
    156 	/* Find a desirable physical device and allocate from it. */
    157 	do {
    158 		if (sip == NULL)
    159 			break;
    160 		if (!(sip->si_flags & ST_INDEL) &&
    161 		    (spgcnt_t)sip->si_nfpgs > 0) {
    162 			/* Caller wants other than specified swap device */
    163 			if (flags & SA_NOT) {
    164 				if (*vpp != sip->si_vp ||
    165 				    *offp < sip->si_soff ||
    166 				    *offp >= sip->si_eoff)
    167 					goto found;
    168 			/* Caller is loose, will take anything */
    169 			} else
    170 				goto found;
    171 		} else if (sip->si_nfpgs == 0)
    172 			sip->si_allocs = 0;
    173 		if ((sip = sip->si_next) == NULL)
    174 			sip = swapinfo;
    175 	} while (sip != silast);
    176 	mutex_exit(&swapinfo_lock);
    177 	return (0);
    178 found:
    179 	soff = swap_getoff(sip);
    180 	sip->si_nfpgs--;
    181 	if (soff == -1)
    182 		panic("swap_alloc: swap_getoff failed!");
    183 
    184 	for (len = PAGESIZE; len < *lenp; len += PAGESIZE) {
    185 		if (sip->si_nfpgs == 0)
    186 			break;
    187 		if (swapalloc_maxcontig && len >= swapalloc_maxcontig)
    188 			break;
    189 		noff = swap_getoff(sip);
    190 		if (noff == -1) {
    191 			break;
    192 		} else if (noff != soff + len) {
    193 			CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff));
    194 			break;
    195 		}
    196 		sip->si_nfpgs--;
    197 	}
    198 	*vpp = sip->si_vp;
    199 	*offp = soff;
    200 	*lenp = len;
    201 	ASSERT((spgcnt_t)sip->si_nfpgs >= 0);
    202 	sip->si_allocs += btop(len);
    203 	if (sip->si_allocs >= swap_maxcontig) {
    204 		sip->si_allocs = 0;
    205 		if ((silast = sip->si_next) == NULL)
    206 			silast = swapinfo;
    207 	}
    208 	TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC,
    209 	    "swap_alloc:sip %p offset %lx", sip, soff);
    210 	mutex_exit(&swapinfo_lock);
    211 	return (1);
    212 }
    213 
    214 int swap_backsearch = 0;
    215 
    216 /*
    217  * Get a free offset on swap device sip.
    218  * Return >=0 offset if succeeded, -1 for failure.
    219  */
    220 static u_offset_t
    221 swap_getoff(struct swapinfo *sip)
    222 {
    223 	uint_t *sp, *ep;
    224 	size_t aoff, boff, poff, slotnumber;
    225 
    226 	ASSERT(MUTEX_HELD(&swapinfo_lock));
    227 
    228 	sip->si_alloccnt++;
    229 	for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
    230 	    ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) {
    231 		if (*sp != (uint_t)0xffffffff)
    232 			goto foundentry;
    233 		else
    234 			sip->si_checkcnt++;
    235 	}
    236 	SWAP_PRINT(SW_ALLOC,
    237 	    "swap_getoff: couldn't find slot from hint %ld to end\n",
    238 	    sip->si_hint, 0, 0, 0, 0);
    239 	/*
    240 	 * Go backwards? Check for faster method XXX
    241 	 */
    242 	if (swap_backsearch) {
    243 		for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
    244 		    ep = sip->si_swapslots; sp > ep; sp--) {
    245 			if (*sp != (uint_t)0xffffffff)
    246 				goto foundentry;
    247 			else
    248 				sip->si_checkcnt++;
    249 		}
    250 	} else {
    251 		for (sp = sip->si_swapslots,
    252 		    ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT];
    253 		    sp < ep; sp++) {
    254 			if (*sp != (uint_t)0xffffffff)
    255 				goto foundentry;
    256 			else
    257 				sip->si_checkcnt++;
    258 		}
    259 	}
    260 	if (*sp == 0xffffffff) {
    261 		cmn_err(CE_WARN, "No free swap slots!");
    262 		return ((u_offset_t)-1);
    263 	}
    264 
    265 foundentry:
    266 	/*
    267 	 * aoff is the page number offset (in bytes) of the si_swapslots
    268 	 * array element containing a free page
    269 	 *
    270 	 * boff is the page number offset of the free page
    271 	 * (i.e. cleared bit) in si_swapslots[aoff].
    272 	 */
    273 	aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY;
    274 
    275 	for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) {
    276 		if (!TESTBIT(sip->si_swapslots, aoff + boff))
    277 			goto foundslot;
    278 		else
    279 			sip->si_checkcnt++;
    280 	}
    281 	for (boff = 0; boff < (sip->si_hint % NBBW); boff++) {
    282 		if (!TESTBIT(sip->si_swapslots, aoff + boff))
    283 			goto foundslot;
    284 		else
    285 			sip->si_checkcnt++;
    286 	}
    287 	panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint);
    288 
    289 foundslot:
    290 	/*
    291 	 * Return the offset of the free page in swap device.
    292 	 * Convert page number of byte offset and add starting
    293 	 * offset of swap device.
    294 	 */
    295 	slotnumber = aoff + boff;
    296 	SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n",
    297 	    slotnumber, 0, 0, 0, 0);
    298 	poff = ptob(slotnumber);
    299 	if (poff + sip->si_soff >= sip->si_eoff)
    300 		printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n",
    301 		    aoff, boff, ptob(slotnumber), (long)sip->si_eoff);
    302 	ASSERT(poff < sip->si_eoff);
    303 	/*
    304 	 * We could verify here that the slot isn't already allocated
    305 	 * by looking through all the anon slots.
    306 	 */
    307 	SETBIT(sip->si_swapslots, slotnumber);
    308 	sip->si_hint = slotnumber + 1;	/* hint = next slot */
    309 	return (poff + sip->si_soff);
    310 }
    311 
    312 /*
    313  * Free a swap page.
    314  */
    315 void
    316 swap_phys_free(struct vnode *vp, u_offset_t off, size_t len)
    317 {
    318 	struct swapinfo *sip;
    319 	ssize_t pagenumber, npage;
    320 
    321 	mutex_enter(&swapinfo_lock);
    322 	sip = swapinfo;
    323 
    324 	do {
    325 		if (sip->si_vp == vp &&
    326 		    sip->si_soff <= off && off < sip->si_eoff) {
    327 			for (pagenumber = btop(off - sip->si_soff),
    328 			    npage = btop(len) + pagenumber;
    329 			    pagenumber < npage; pagenumber++) {
    330 				SWAP_PRINT(SW_ALLOC,
    331 				    "swap_phys_free: freeing slot %ld on "
    332 				    "sip %p\n",
    333 				    pagenumber, sip, 0, 0, 0);
    334 				if (!TESTBIT(sip->si_swapslots, pagenumber)) {
    335 					panic(
    336 					    "swap_phys_free: freeing free slot "
    337 					    "%p,%lx\n", (void *)vp,
    338 					    ptob(pagenumber) + sip->si_soff);
    339 				}
    340 				CLEARBIT(sip->si_swapslots, pagenumber);
    341 				sip->si_nfpgs++;
    342 			}
    343 			ASSERT(sip->si_nfpgs <= sip->si_npgs);
    344 			mutex_exit(&swapinfo_lock);
    345 			return;
    346 		}
    347 	} while ((sip = sip->si_next) != NULL);
    348 	panic("swap_phys_free");
    349 	/*NOTREACHED*/
    350 }
    351 
    352 /*
    353  * Return the anon struct corresponding for the given
    354  * <vnode, off> if it is part of the virtual swap device.
    355  * Return the anon struct if found, otherwise NULL.
    356  */
    357 struct anon *
    358 swap_anon(struct vnode *vp, u_offset_t off)
    359 {
    360 	struct anon *ap;
    361 
    362 	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(vp, off)]));
    363 
    364 	for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) {
    365 		if (ap->an_vp == vp && ap->an_off == off)
    366 			return (ap);
    367 	}
    368 	return (NULL);
    369 }
    370 
    371 
    372 /*
    373  * Determine if the vp offset range overlap a swap device.
    374  */
    375 int
    376 swap_in_range(struct vnode *vp, u_offset_t offset, size_t len)
    377 {
    378 	struct swapinfo *sip;
    379 	u_offset_t eoff;
    380 
    381 	eoff = offset + len;
    382 	ASSERT(eoff > offset);
    383 
    384 	mutex_enter(&swapinfo_lock);
    385 	sip = swapinfo;
    386 	if (vp && sip) {
    387 		do {
    388 			if (vp != sip->si_vp || eoff <= sip->si_soff ||
    389 			    offset >= sip->si_eoff)
    390 				continue;
    391 			mutex_exit(&swapinfo_lock);
    392 			return (1);
    393 		} while ((sip = sip->si_next) != NULL);
    394 	}
    395 	mutex_exit(&swapinfo_lock);
    396 	return (0);
    397 }
    398 
    399 /*
    400  * See if name is one of our swap files
    401  * even though lookupname failed.
    402  * This can be used by swapdel to delete
    403  * swap resources on remote machines
    404  * where the link has gone down.
    405  */
    406 static struct vnode *
    407 swapdel_byname(
    408 	char 	*name,			/* pathname to delete */
    409 	ulong_t lowblk) 	/* Low block number of area to delete */
    410 {
    411 	struct swapinfo **sipp, *osip;
    412 	u_offset_t soff;
    413 
    414 	/*
    415 	 * Find the swap file entry for the file to
    416 	 * be deleted. Skip any entries that are in
    417 	 * transition.
    418 	 */
    419 
    420 	soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
    421 
    422 	mutex_enter(&swapinfo_lock);
    423 	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
    424 		if ((strcmp(osip->si_pname, name) == 0) &&
    425 		    (osip->si_soff == soff) && (osip->si_flags == 0)) {
    426 			struct vnode *vp = osip->si_vp;
    427 
    428 			VN_HOLD(vp);
    429 			mutex_exit(&swapinfo_lock);
    430 			return (vp);
    431 		}
    432 	}
    433 	mutex_exit(&swapinfo_lock);
    434 	return (NULL);
    435 }
    436 
    437 
    438 /*
    439  * New system call to manipulate swap files.
    440  */
    441 int
    442 swapctl(int sc_cmd, void *sc_arg, int *rv)
    443 {
    444 	struct swapinfo *sip, *csip, *tsip;
    445 	int error = 0;
    446 	struct swapent st, *ust;
    447 	struct swapres sr;
    448 	struct vnode *vp;
    449 	int cnt = 0;
    450 	int tmp_nswapfiles;
    451 	int nswap;
    452 	int length, nlen;
    453 	int gplen = 0, plen;
    454 	char *swapname;
    455 	char *pname;
    456 	char *tpname;
    457 	struct anoninfo ai;
    458 	spgcnt_t avail;
    459 	int global = INGLOBALZONE(curproc);
    460 
    461 	/*
    462 	 * When running in a zone we want to hide the details of the swap
    463 	 * devices: we report there only being one swap device named "swap"
    464 	 * having a size equal to the sum of the sizes of all real swap devices
    465 	 * on the system.
    466 	 */
    467 	switch (sc_cmd) {
    468 	case SC_GETNSWP:
    469 		if (global)
    470 			*rv = nswapfiles;
    471 		else
    472 			*rv = 1;
    473 		return (0);
    474 
    475 	case SC_AINFO:
    476 		/*
    477 		 * Return anoninfo information with these changes:
    478 		 * ani_max = maximum amount of swap space
    479 		 *	(including potentially available physical memory)
    480 		 * ani_free = amount of unallocated anonymous memory
    481 		 *	(some of which might be reserved and including
    482 		 *	 potentially available physical memory)
    483 		 * ani_resv = amount of claimed (reserved) anonymous memory
    484 		 */
    485 		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
    486 		ai.ani_max = (k_anoninfo.ani_max +
    487 		    k_anoninfo.ani_mem_resv) +avail;
    488 
    489 		ai.ani_free = k_anoninfo.ani_free + avail;
    490 
    491 		ai.ani_resv = k_anoninfo.ani_phys_resv +
    492 		    k_anoninfo.ani_mem_resv;
    493 
    494 		if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0)
    495 			return (EFAULT);
    496 		return (0);
    497 
    498 	case SC_LIST:
    499 		if (copyin(sc_arg, &length, sizeof (int)) != 0)
    500 			return (EFAULT);
    501 		if (!global) {
    502 			struct swapent st;
    503 			char *swappath = "swap";
    504 
    505 			if (length < 1)
    506 				return (ENOMEM);
    507 			ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
    508 			if (copyin(ust, &st, sizeof (swapent_t)) != 0)
    509 				return (EFAULT);
    510 			st.ste_start = PAGESIZE >> SCTRSHFT;
    511 			st.ste_length = (off_t)0;
    512 			st.ste_pages = 0;
    513 			st.ste_free = 0;
    514 			st.ste_flags = 0;
    515 			mutex_enter(&swapinfo_lock);
    516 			for (sip = swapinfo, nswap = 0;
    517 			    sip != NULL && nswap < nswapfiles;
    518 			    sip = sip->si_next, nswap++) {
    519 				st.ste_length +=
    520 				    (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
    521 				st.ste_pages += sip->si_npgs;
    522 				st.ste_free += sip->si_nfpgs;
    523 			}
    524 			mutex_exit(&swapinfo_lock);
    525 			if (copyout(&st, ust, sizeof (swapent_t)) != 0 ||
    526 			    copyout(swappath, st.ste_path,
    527 			    strlen(swappath) + 1) != 0) {
    528 				return (EFAULT);
    529 			}
    530 			*rv = 1;
    531 			return (0);
    532 		}
    533 beginning:
    534 		tmp_nswapfiles = nswapfiles;
    535 		/* Return an error if not enough space for the whole table. */
    536 		if (length < tmp_nswapfiles)
    537 			return (ENOMEM);
    538 		/*
    539 		 * Get memory to hold the swap entries and their names. We'll
    540 		 * copy the real entries into these and then copy these out.
    541 		 * Allocating the pathname memory is only a guess so we may
    542 		 * find that we need more and have to do it again.
    543 		 * All this is because we have to hold the anon lock while
    544 		 * traversing the swapinfo list, and we can't be doing copyouts
    545 		 * and/or kmem_alloc()s during this.
    546 		 */
    547 		csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo),
    548 		    KM_SLEEP);
    549 retry:
    550 		nlen = tmp_nswapfiles * (gplen += 100);
    551 		pname = kmem_zalloc(nlen, KM_SLEEP);
    552 
    553 		mutex_enter(&swapinfo_lock);
    554 
    555 		if (tmp_nswapfiles != nswapfiles) {
    556 			mutex_exit(&swapinfo_lock);
    557 			kmem_free(pname, nlen);
    558 			kmem_free(csip,
    559 			    tmp_nswapfiles * sizeof (struct swapinfo));
    560 			gplen = 0;
    561 			goto beginning;
    562 		}
    563 		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
    564 		    sip && nswap < tmp_nswapfiles;
    565 		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
    566 			plen = sip->si_pnamelen;
    567 			if (tpname + plen - pname > nlen) {
    568 				mutex_exit(&swapinfo_lock);
    569 				kmem_free(pname, nlen);
    570 				goto retry;
    571 			}
    572 			*tsip = *sip;
    573 			tsip->si_pname = tpname;
    574 			(void) strcpy(tsip->si_pname, sip->si_pname);
    575 		}
    576 		mutex_exit(&swapinfo_lock);
    577 
    578 		if (sip) {
    579 			error = ENOMEM;
    580 			goto lout;
    581 		}
    582 		ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
    583 		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
    584 			if (copyin(ust, &st, sizeof (swapent_t)) != 0) {
    585 				error = EFAULT;
    586 				goto lout;
    587 			}
    588 			st.ste_flags = tsip->si_flags;
    589 			st.ste_length =
    590 			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
    591 			st.ste_start = tsip->si_soff >> SCTRSHFT;
    592 			st.ste_pages = tsip->si_npgs;
    593 			st.ste_free = tsip->si_nfpgs;
    594 			if (copyout(&st, ust, sizeof (swapent_t)) != 0) {
    595 				error = EFAULT;
    596 				goto lout;
    597 			}
    598 			if (!tsip->si_pnamelen)
    599 				continue;
    600 			if (copyout(tsip->si_pname, st.ste_path,
    601 			    tsip->si_pnamelen) != 0) {
    602 				error = EFAULT;
    603 				goto lout;
    604 			}
    605 		}
    606 		*rv = nswap;
    607 lout:
    608 		kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo));
    609 		kmem_free(pname, nlen);
    610 		return (error);
    611 
    612 	case SC_ADD:
    613 	case SC_REMOVE:
    614 		break;
    615 	default:
    616 		return (EINVAL);
    617 	}
    618 	if ((error = secpolicy_swapctl(CRED())) != 0)
    619 		return (error);
    620 
    621 	if (copyin(sc_arg, &sr, sizeof (swapres_t)))
    622 		return (EFAULT);
    623 
    624 	/* Allocate the space to read in pathname */
    625 	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
    626 		return (ENOMEM);
    627 
    628 	error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0);
    629 	if (error)
    630 		goto out;
    631 
    632 	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
    633 	if (error) {
    634 		if (sc_cmd == SC_ADD)
    635 			goto out;
    636 		/* see if we match by name */
    637 		vp = swapdel_byname(swapname, (size_t)sr.sr_start);
    638 		if (vp == NULL)
    639 			goto out;
    640 	}
    641 
    642 	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
    643 		VN_RELE(vp);
    644 		error = ENOSYS;
    645 		goto out;
    646 	}
    647 	switch (vp->v_type) {
    648 	case VBLK:
    649 		break;
    650 
    651 	case VREG:
    652 		if (vp->v_vfsp && vn_is_readonly(vp))
    653 			error = EROFS;
    654 		else
    655 			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
    656 		break;
    657 
    658 	case VDIR:
    659 		error = EISDIR;
    660 		break;
    661 	default:
    662 		error = ENOSYS;
    663 		break;
    664 	}
    665 	if (error == 0) {
    666 		if (sc_cmd == SC_REMOVE)
    667 			error = swapdel(vp, sr.sr_start);
    668 		else
    669 			error = swapadd(vp, sr.sr_start,
    670 			    sr.sr_length, swapname);
    671 	}
    672 	VN_RELE(vp);
    673 out:
    674 	kmem_free(swapname, MAXPATHLEN);
    675 	return (error);
    676 }
    677 
    678 #if defined(_LP64) && defined(_SYSCALL32)
    679 
    680 int
    681 swapctl32(int sc_cmd, void *sc_arg, int *rv)
    682 {
    683 	struct swapinfo *sip, *csip, *tsip;
    684 	int error = 0;
    685 	struct swapent32 st, *ust;
    686 	struct swapres32 sr;
    687 	struct vnode *vp;
    688 	int cnt = 0;
    689 	int tmp_nswapfiles;
    690 	int nswap;
    691 	int length, nlen;
    692 	int gplen = 0, plen;
    693 	char *swapname;
    694 	char *pname;
    695 	char *tpname;
    696 	struct anoninfo32 ai;
    697 	size_t s;
    698 	spgcnt_t avail;
    699 
    700 	switch (sc_cmd) {
    701 	case SC_GETNSWP:
    702 		*rv = nswapfiles;
    703 		return (0);
    704 
    705 	case SC_AINFO:
    706 		/*
    707 		 * Return anoninfo information with these changes:
    708 		 * ani_max = maximum amount of swap space
    709 		 *	(including potentially available physical memory)
    710 		 * ani_free = amount of unallocated anonymous memory
    711 		 *	(some of which might be reserved and including
    712 		 *	 potentially available physical memory)
    713 		 * ani_resv = amount of claimed (reserved) anonymous memory
    714 		 */
    715 		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
    716 		s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail;
    717 		if (s > UINT32_MAX)
    718 			return (EOVERFLOW);
    719 		ai.ani_max = s;
    720 
    721 		s = k_anoninfo.ani_free + avail;
    722 		if (s > UINT32_MAX)
    723 			return (EOVERFLOW);
    724 		ai.ani_free = s;
    725 
    726 		s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv;
    727 		if (s > UINT32_MAX)
    728 			return (EOVERFLOW);
    729 		ai.ani_resv = s;
    730 
    731 		if (copyout(&ai, sc_arg, sizeof (ai)) != 0)
    732 			return (EFAULT);
    733 		return (0);
    734 
    735 	case SC_LIST:
    736 		if (copyin(sc_arg, &length, sizeof (int32_t)) != 0)
    737 			return (EFAULT);
    738 beginning:
    739 		tmp_nswapfiles = nswapfiles;
    740 		/* Return an error if not enough space for the whole table. */
    741 		if (length < tmp_nswapfiles)
    742 			return (ENOMEM);
    743 		/*
    744 		 * Get memory to hold the swap entries and their names. We'll
    745 		 * copy the real entries into these and then copy these out.
    746 		 * Allocating the pathname memory is only a guess so we may
    747 		 * find that we need more and have to do it again.
    748 		 * All this is because we have to hold the anon lock while
    749 		 * traversing the swapinfo list, and we can't be doing copyouts
    750 		 * and/or kmem_alloc()s during this.
    751 		 */
    752 		csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP);
    753 retry:
    754 		nlen = tmp_nswapfiles * (gplen += 100);
    755 		pname = kmem_zalloc(nlen, KM_SLEEP);
    756 
    757 		mutex_enter(&swapinfo_lock);
    758 
    759 		if (tmp_nswapfiles != nswapfiles) {
    760 			mutex_exit(&swapinfo_lock);
    761 			kmem_free(pname, nlen);
    762 			kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
    763 			gplen = 0;
    764 			goto beginning;
    765 		}
    766 		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
    767 		    (sip != NULL) && (nswap < tmp_nswapfiles);
    768 		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
    769 			plen = sip->si_pnamelen;
    770 			if (tpname + plen - pname > nlen) {
    771 				mutex_exit(&swapinfo_lock);
    772 				kmem_free(pname, nlen);
    773 				goto retry;
    774 			}
    775 			*tsip = *sip;
    776 			tsip->si_pname = tpname;
    777 			(void) strcpy(tsip->si_pname, sip->si_pname);
    778 		}
    779 		mutex_exit(&swapinfo_lock);
    780 
    781 		if (sip != NULL) {
    782 			error = ENOMEM;
    783 			goto lout;
    784 		}
    785 		ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
    786 		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
    787 			if (copyin(ust, &st, sizeof (*ust)) != 0) {
    788 				error = EFAULT;
    789 				goto lout;
    790 			}
    791 			st.ste_flags = tsip->si_flags;
    792 			st.ste_length =
    793 			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
    794 			st.ste_start = tsip->si_soff >> SCTRSHFT;
    795 			st.ste_pages = tsip->si_npgs;
    796 			st.ste_free = tsip->si_nfpgs;
    797 			if (copyout(&st, ust, sizeof (st)) != 0) {
    798 				error = EFAULT;
    799 				goto lout;
    800 			}
    801 			if (!tsip->si_pnamelen)
    802 				continue;
    803 			if (copyout(tsip->si_pname,
    804 			    (caddr_t)(uintptr_t)st.ste_path,
    805 			    tsip->si_pnamelen) != 0) {
    806 				error = EFAULT;
    807 				goto lout;
    808 			}
    809 		}
    810 		*rv = nswap;
    811 lout:
    812 		kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
    813 		kmem_free(pname, nlen);
    814 		return (error);
    815 
    816 	case SC_ADD:
    817 	case SC_REMOVE:
    818 		break;
    819 	default:
    820 		return (EINVAL);
    821 	}
    822 	if ((error = secpolicy_swapctl(CRED())) != 0)
    823 		return (error);
    824 
    825 	if (copyin(sc_arg, &sr, sizeof (sr)))
    826 		return (EFAULT);
    827 
    828 	/* Allocate the space to read in pathname */
    829 	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
    830 		return (ENOMEM);
    831 
    832 	error = copyinstr((caddr_t)(uintptr_t)sr.sr_name,
    833 	    swapname, MAXPATHLEN, NULL);
    834 	if (error)
    835 		goto out;
    836 
    837 	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
    838 	if (error) {
    839 		if (sc_cmd == SC_ADD)
    840 			goto out;
    841 		/* see if we match by name */
    842 		vp = swapdel_byname(swapname, (uint_t)sr.sr_start);
    843 		if (vp == NULL)
    844 			goto out;
    845 	}
    846 
    847 	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
    848 		VN_RELE(vp);
    849 		error = ENOSYS;
    850 		goto out;
    851 	}
    852 	switch (vp->v_type) {
    853 	case VBLK:
    854 		break;
    855 
    856 	case VREG:
    857 		if (vp->v_vfsp && vn_is_readonly(vp))
    858 			error = EROFS;
    859 		else
    860 			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
    861 		break;
    862 
    863 	case VDIR:
    864 		error = EISDIR;
    865 		break;
    866 	default:
    867 		error = ENOSYS;
    868 		break;
    869 	}
    870 	if (error == 0) {
    871 		if (sc_cmd == SC_REMOVE)
    872 			error = swapdel(vp, sr.sr_start);
    873 		else
    874 			error = swapadd(vp, sr.sr_start, sr.sr_length,
    875 			    swapname);
    876 	}
    877 	VN_RELE(vp);
    878 out:
    879 	kmem_free(swapname, MAXPATHLEN);
    880 	return (error);
    881 }
    882 
    883 #endif /* _LP64 && _SYSCALL32 */
    884 
    885 /*
    886  * Add a new swap file.
    887  */
    888 int
    889 swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname)
    890 {
    891 	struct swapinfo **sipp, *nsip = NULL, *esip = NULL;
    892 	struct vnode *cvp;
    893 	struct vattr vattr;
    894 	pgcnt_t pages;
    895 	u_offset_t soff, eoff;
    896 	int error;
    897 	ssize_t i, start, end;
    898 	ushort_t wasswap;
    899 	ulong_t startblk;
    900 	size_t	returned_mem;
    901 
    902 	SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n",
    903 	    vp, lowblk, nblks, swapname, 0);
    904 	/*
    905 	 * Get the real vnode. (If vp is not a specnode it just returns vp, so
    906 	 * it does the right thing, but having this code know about specnodes
    907 	 * violates the spirit of having it be indepedent of vnode type.)
    908 	 */
    909 	cvp = common_specvp(vp);
    910 
    911 	/*
    912 	 * Or in VISSWAP so file system has chance to deny swap-ons during open.
    913 	 */
    914 	mutex_enter(&cvp->v_lock);
    915 	wasswap = cvp->v_flag & VISSWAP;
    916 	cvp->v_flag |= VISSWAP;
    917 	mutex_exit(&cvp->v_lock);
    918 
    919 	mutex_enter(&swap_lock);
    920 	if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED(), NULL)) {
    921 		mutex_exit(&swap_lock);
    922 		/* restore state of v_flag */
    923 		if (!wasswap) {
    924 			mutex_enter(&cvp->v_lock);
    925 			cvp->v_flag &= ~VISSWAP;
    926 			mutex_exit(&cvp->v_lock);
    927 		}
    928 		return (error);
    929 	}
    930 	mutex_exit(&swap_lock);
    931 
    932 	/*
    933 	 * Get partition size. Return error if empty partition,
    934 	 * or if request does not fit within the partition.
    935 	 * If this is the first swap device, we can reduce
    936 	 * the size of the swap area to match what is
    937 	 * available.  This can happen if the system was built
    938 	 * on a machine with a different size swap partition.
    939 	 */
    940 	vattr.va_mask = AT_SIZE;
    941 	if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED(), NULL))
    942 		goto out;
    943 
    944 	/*
    945 	 * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the
    946 	 * size of the device can't be determined.
    947 	 */
    948 	if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) {
    949 		error = EINVAL;
    950 		goto out;
    951 	}
    952 
    953 #ifdef	_ILP32
    954 	/*
    955 	 * No support for large swap in 32-bit OS, if the size of the swap is
    956 	 * bigger than MAXOFF32_T then the size used by swapfs must be limited.
    957 	 * This limitation is imposed by the swap subsystem itself, a D_64BIT
    958 	 * driver as the target of swap operation should be able to field
    959 	 * the IO.
    960 	 */
    961 	if (vattr.va_size > MAXOFF32_T) {
    962 		cmn_err(CE_NOTE,
    963 		    "!swap device %s truncated from 0x%llx to 0x%x bytes",
    964 		    swapname, vattr.va_size, MAXOFF32_T);
    965 		vattr.va_size = MAXOFF32_T;
    966 	}
    967 #endif	/* _ILP32 */
    968 
    969 	/* Fail if file not writeable (try to set size to current size) */
    970 	vattr.va_mask = AT_SIZE;
    971 	if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL))
    972 		goto out;
    973 
    974 	/* Fail if fs does not support VOP_PAGEIO */
    975 	error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED(),
    976 	    NULL);
    977 
    978 	if (error == ENOSYS)
    979 		goto out;
    980 	else
    981 		error = 0;
    982 	/*
    983 	 * If swapping on the root filesystem don't put swap blocks that
    984 	 * correspond to the miniroot filesystem on the swap free list.
    985 	 */
    986 	if (cvp == rootdir)
    987 		startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT;
    988 	else				/* Skip 1st page (disk label) */
    989 		startblk = (ulong_t)(lowblk ? lowblk : 1);
    990 
    991 	soff = startblk << SCTRSHFT;
    992 	if (soff >= vattr.va_size) {
    993 		error = EINVAL;
    994 		goto out;
    995 	}
    996 
    997 	/*
    998 	 * If user specified 0 blks, use the size of the device
    999 	 */
   1000 	eoff = nblks ?  soff + (nblks - (startblk - lowblk) << SCTRSHFT) :
   1001 	    vattr.va_size;
   1002 
   1003 	SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n",
   1004 	    vattr.va_size, soff, eoff, 0, 0);
   1005 
   1006 	if (eoff > vattr.va_size) {
   1007 		error = EINVAL;
   1008 		goto out;
   1009 	}
   1010 
   1011 	/*
   1012 	 * The starting and ending offsets must be page aligned.
   1013 	 * Round soff up to next page boundary, round eoff
   1014 	 * down to previous page boundary.
   1015 	 */
   1016 	soff = ptob(btopr(soff));
   1017 	eoff = ptob(btop(eoff));
   1018 	if (soff >= eoff) {
   1019 		SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n",
   1020 		    soff, eoff, 0, 0, 0);
   1021 		error = EINVAL;
   1022 		goto out;
   1023 	}
   1024 
   1025 	pages = btop(eoff - soff);
   1026 
   1027 	/* Allocate and partially set up the new swapinfo */
   1028 	nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP);
   1029 	nsip->si_vp = cvp;
   1030 
   1031 	nsip->si_soff = soff;
   1032 	nsip->si_eoff = eoff;
   1033 	nsip->si_hint = 0;
   1034 	nsip->si_checkcnt = nsip->si_alloccnt = 0;
   1035 
   1036 	nsip->si_pnamelen = (int)strlen(swapname) + 1;
   1037 	nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP);
   1038 	bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1);
   1039 	SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n",
   1040 	    swapname, pages, 0, 0, 0);
   1041 	/*
   1042 	 * Size of swapslots map in bytes
   1043 	 */
   1044 	nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY;
   1045 	nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize,