Home | History | Annotate | Download | only in fs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     22 /*	  All Rights Reserved  	*/
     23 
     24 
     25 /*
     26  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     27  * Use is subject to license terms.
     28  */
     29 
     30 #include <sys/types.h>
     31 #include <sys/t_lock.h>
     32 #include <sys/param.h>
     33 #include <sys/tuneable.h>
     34 #include <sys/inline.h>
     35 #include <sys/systm.h>
     36 #include <sys/proc.h>
     37 #include <sys/user.h>
     38 #include <sys/var.h>
     39 #include <sys/buf.h>
     40 #include <sys/vfs.h>
     41 #include <sys/cred.h>
     42 #include <sys/kmem.h>
     43 #include <sys/vnode.h>
     44 #include <sys/swap.h>
     45 #include <sys/vm.h>
     46 #include <sys/debug.h>
     47 #include <sys/cmn_err.h>
     48 #include <sys/sysinfo.h>
     49 #include <sys/callb.h>
     50 #include <sys/reboot.h>
     51 #include <sys/time.h>
     52 #include <sys/fs/ufs_inode.h>
     53 #include <sys/fs/ufs_bio.h>
     54 
     55 #include <vm/hat.h>
     56 #include <vm/page.h>
     57 #include <vm/pvn.h>
     58 #include <vm/seg_kmem.h>
     59 
     60 int doiflush = 1;	/* non-zero to turn inode flushing on */
     61 int dopageflush = 1;	/* non-zero to turn page flushing on */
     62 
     63 /*
     64  * To improve boot performance, don't run the inode flushing loop until
     65  * the specified number of seconds after boot.  To revert to the old
     66  * behavior, set fsflush_iflush_delay to 0.  We have not created any new
     67  * filesystem danger that did not exist previously, since there is always a
     68  * window in between when fsflush does the inode flush loop during which the
     69  * system could crash, fail to sync the filesystem, and fsck will be needed
     70  * to recover.  We have, however, widened this window.  Finally,
     71  * we never delay inode flushing if we're booting into single user mode,
     72  * where the administrator may be modifying files or using fsck.  This
     73  * modification avoids inode flushes during boot whose only purpose is to
     74  * update atimes on files which have been accessed during boot.
     75  */
     76 int fsflush_iflush_delay = 60;
     77 
     78 kcondvar_t fsflush_cv;
     79 static kmutex_t fsflush_lock;	/* just for the cv_wait */
     80 ksema_t fsflush_sema;		/* to serialize with reboot */
     81 
     82 /*
     83  * some statistics for fsflush_do_pages
     84  */
     85 typedef struct {
     86 	ulong_t fsf_scan;	/* number of pages scanned */
     87 	ulong_t fsf_examined;	/* number of page_t's actually examined, can */
     88 				/* be less than fsf_scan due to large pages */
     89 	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
     90 	ulong_t fsf_modified;	/* number of modified pages found */
     91 	ulong_t fsf_coalesce;	/* number of page coalesces done */
     92 	ulong_t fsf_time;	/* nanoseconds of run time */
     93 	ulong_t fsf_releases;	/* number of page_release() done */
     94 } fsf_stat_t;
     95 
     96 fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
     97 fsf_stat_t fsf_total;	/* total of counts */
     98 ulong_t fsf_cycles;	/* number of runs refelected in fsf_total */
     99 
    100 /*
    101  * data used to determine when we can coalesce consecutive free pages
    102  * into larger pages.
    103  */
    104 #define	MAX_PAGESIZES	32
    105 static ulong_t		fsf_npgsz;
    106 static pgcnt_t		fsf_pgcnt[MAX_PAGESIZES];
    107 static pgcnt_t		fsf_mask[MAX_PAGESIZES];
    108 
    109 
    110 /*
    111  * Scan page_t's and issue I/O's for modified pages.
    112  *
    113  * Also coalesces consecutive small sized free pages into the next larger
    114  * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
    115  * spent scanning on later passes and for anybody allocating large pages.
    116  */
    117 static void
    118 fsflush_do_pages()
    119 {
    120 	vnode_t		*vp;
    121 	ulong_t		pcount;
    122 	hrtime_t	timer = gethrtime();
    123 	ulong_t		releases = 0;
    124 	ulong_t		nexamined = 0;
    125 	ulong_t		nlocked = 0;
    126 	ulong_t		nmodified = 0;
    127 	ulong_t		ncoalesce = 0;
    128 	ulong_t		cnt;
    129 	int		mod;
    130 	int		fspage = 1;
    131 	u_offset_t	offset;
    132 	uint_t		szc;
    133 
    134 	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
    135 	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
    136 	uint_t		coal_cnt = 0;	    /* count of pages seen */
    137 
    138 	static ulong_t	nscan = 0;
    139 	static pgcnt_t	last_total_pages = 0;
    140 	static page_t	*pp = NULL;
    141 
    142 	/*
    143 	 * Check to see if total_pages has changed.
    144 	 */
    145 	if (total_pages != last_total_pages) {
    146 		last_total_pages = total_pages;
    147 		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
    148 	}
    149 
    150 	if (pp == NULL)
    151 		pp = memsegs->pages;
    152 
    153 	pcount = 0;
    154 	while (pcount < nscan) {
    155 
    156 		/*
    157 		 * move to the next page, skipping over large pages
    158 		 * and issuing prefetches.
    159 		 */
    160 		if (pp->p_szc && fspage == 0) {
    161 			pfn_t pfn;
    162 
    163 			pfn  = page_pptonum(pp);
    164 			cnt = page_get_pagecnt(pp->p_szc);
    165 			cnt -= pfn & (cnt - 1);
    166 		} else
    167 			cnt = 1;
    168 
    169 		pp = page_nextn(pp, cnt);
    170 		prefetch_page_r((void *)pp);
    171 		ASSERT(pp != NULL);
    172 		pcount += cnt;
    173 
    174 		/*
    175 		 * Do a bunch of dirty tests (ie. no locking) to determine
    176 		 * if we can quickly skip this page. These tests are repeated
    177 		 * after acquiring the page lock.
    178 		 */
    179 		++nexamined;
    180 		if (PP_ISSWAP(pp)) {
    181 			fspage = 0;
    182 			coal_page = NULL;
    183 			continue;
    184 		}
    185 
    186 		/*
    187 		 * skip free pages too, but try coalescing them into larger
    188 		 * pagesizes
    189 		 */
    190 		if (PP_ISFREE(pp)) {
    191 			/*
    192 			 * skip pages with a file system identity or that
    193 			 * are already maximum size
    194 			 */
    195 			fspage = 0;
    196 			szc = pp->p_szc;
    197 			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
    198 				coal_page = NULL;
    199 				continue;
    200 			}
    201 
    202 			/*
    203 			 * If not in a coalescing candidate page or the size
    204 			 * codes are different, start a new candidate.
    205 			 */
    206 			if (coal_page == NULL || coal_szc != szc) {
    207 
    208 				/*
    209 				 * page must be properly aligned
    210 				 */
    211 				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
    212 					coal_page = NULL;
    213 					continue;
    214 				}
    215 				coal_page = pp;
    216 				coal_szc = szc;
    217 				coal_cnt = 1;
    218 				continue;
    219 			}
    220 
    221 			/*
    222 			 * acceptable to add this to existing candidate page
    223 			 */
    224 			++coal_cnt;
    225 			if (coal_cnt < fsf_pgcnt[coal_szc])
    226 				continue;
    227 
    228 			/*
    229 			 * We've got enough pages to coalesce, so do it.
    230 			 * After promoting, we clear coal_page, so it will
    231 			 * take another pass to promote this to an even
    232 			 * larger page.
    233 			 */
    234 			++ncoalesce;
    235 			(void) page_promote_size(coal_page, coal_szc);
    236 			coal_page = NULL;
    237 			continue;
    238 		} else {
    239 			coal_page = NULL;
    240 		}
    241 
    242 		if (PP_ISKAS(pp) ||
    243 		    PAGE_LOCKED(pp) ||
    244 		    pp->p_lckcnt != 0 ||
    245 		    pp->p_cowcnt != 0) {
    246 			fspage = 0;
    247 			continue;
    248 		}
    249 
    250 
    251 		/*
    252 		 * Reject pages that can't be "exclusively" locked.
    253 		 */
    254 		if (!page_trylock(pp, SE_EXCL))
    255 			continue;
    256 		++nlocked;
    257 
    258 
    259 		/*
    260 		 * After locking the page, redo the above checks.
    261 		 * Since we locked the page, leave out the PAGE_LOCKED() test.
    262 		 */
    263 		vp = pp->p_vnode;
    264 		if (PP_ISSWAP(pp) ||
    265 		    PP_ISFREE(pp) ||
    266 		    vp == NULL ||
    267 		    PP_ISKAS(pp) ||
    268 		    (vp->v_flag & VISSWAP) != 0) {
    269 			page_unlock(pp);
    270 			fspage = 0;
    271 			continue;
    272 		}
    273 		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
    274 			page_unlock(pp);
    275 			continue;
    276 		}
    277 
    278 		fspage = 1;
    279 		ASSERT(vp->v_type != VCHR);
    280 
    281 		/*
    282 		 * Check the modified bit. Leaving the bit alone in hardware.
    283 		 * It will be cleared if we do the putpage.
    284 		 */
    285 		if (IS_VMODSORT(vp))
    286 			mod = hat_ismod(pp);
    287 		else
    288 			mod = hat_pagesync(pp,
    289 			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;
    290 
    291 		if (mod) {
    292 			++nmodified;
    293 			offset = pp->p_offset;
    294 
    295 			/*
    296 			 * Hold the vnode before releasing the page lock
    297 			 * to prevent it from being freed and re-used by
    298 			 * some other thread.
    299 			 */
    300 			VN_HOLD(vp);
    301 
    302 			page_unlock(pp);
    303 
    304 			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
    305 			    kcred, NULL);
    306 
    307 			VN_RELE(vp);
    308 		} else {
    309 
    310 			/*
    311 			 * Catch any pages which should be on the cache list,
    312 			 * but aren't yet.
    313 			 */
    314 			if (hat_page_is_mapped(pp) == 0) {
    315 				++releases;
    316 				(void) page_release(pp, 1);
    317 			} else {
    318 				page_unlock(pp);
    319 			}
    320 		}
    321 	}
    322 
    323 	/*
    324 	 * maintain statistics
    325 	 * reset every million wakeups, just to avoid overflow
    326 	 */
    327 	if (++fsf_cycles == 1000000) {
    328 		fsf_cycles = 0;
    329 		fsf_total.fsf_scan = 0;
    330 		fsf_total.fsf_examined = 0;
    331 		fsf_total.fsf_locked = 0;
    332 		fsf_total.fsf_modified = 0;
    333 		fsf_total.fsf_coalesce = 0;
    334 		fsf_total.fsf_time = 0;
    335 		fsf_total.fsf_releases = 0;
    336 	} else {
    337 		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
    338 		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
    339 		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
    340 		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
    341 		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
    342 		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
    343 		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
    344 	}
    345 }
    346 
    347 /*
    348  * As part of file system hardening, this daemon is awakened
    349  * every second to flush cached data which includes the
    350  * buffer cache, the inode cache and mapped pages.
    351  */
    352 void
    353 fsflush()
    354 {
    355 	struct buf *bp, *dwp;
    356 	struct hbuf *hp;
    357 	int autoup;
    358 	unsigned int ix, icount, count = 0;
    359 	callb_cpr_t cprinfo;
    360 	uint_t		bcount;
    361 	kmutex_t	*hmp;
    362 	struct vfssw *vswp;
    363 
    364 	proc_fsflush = ttoproc(curthread);
    365 	proc_fsflush->p_cstime = 0;
    366 	proc_fsflush->p_stime =  0;
    367 	proc_fsflush->p_cutime =  0;
    368 	proc_fsflush->p_utime = 0;
    369 	bcopy("fsflush", curproc->p_user.u_psargs, 8);
    370 	bcopy("fsflush", curproc->p_user.u_comm, 7);
    371 
    372 	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
    373 	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);
    374 
    375 	/*
    376 	 * Setup page coalescing.
    377 	 */
    378 	fsf_npgsz = page_num_pagesizes();
    379 	ASSERT(fsf_npgsz < MAX_PAGESIZES);
    380 	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
    381 		fsf_pgcnt[ix] =
    382 		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
    383 		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
    384 	}
    385 
    386 	autoup = v.v_autoup * hz;
    387 	icount = v.v_autoup / tune.t_fsflushr;
    388 	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
    389 loop:
    390 	sema_v(&fsflush_sema);
    391 	mutex_enter(&fsflush_lock);
    392 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
    393 	cv_wait(&fsflush_cv, &fsflush_lock);		/* wait for clock */
    394 	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
    395 	mutex_exit(&fsflush_lock);
    396 	sema_p(&fsflush_sema);
    397 
    398 	/*
    399 	 * Write back all old B_DELWRI buffers on the freelist.
    400 	 */
    401 	bcount = 0;
    402 	for (ix = 0; ix < v.v_hbuf; ix++) {
    403 
    404 		hp = &hbuf[ix];
    405 		dwp = (struct buf *)&dwbuf[ix];
    406 
    407 		bcount += (hp->b_length);
    408 
    409 		if (dwp->av_forw == dwp) {
    410 			continue;
    411 		}
    412 
    413 		hmp = &hbuf[ix].b_lock;
    414 		mutex_enter(hmp);
    415 		bp = dwp->av_forw;
    416 
    417 		/*
    418 		 * Go down only on the delayed write lists.
    419 		 */
    420 		while (bp != dwp) {
    421 
    422 			ASSERT(bp->b_flags & B_DELWRI);
    423 
    424 			if ((bp->b_flags & B_DELWRI) &&
    425 			    (ddi_get_lbolt() - bp->b_start >= autoup) &&
    426 			    sema_tryp(&bp->b_sem)) {
    427 				bp->b_flags |= B_ASYNC;
    428 				hp->b_length--;
    429 				notavail(bp);
    430 				mutex_exit(hmp);
    431 				if (bp->b_vp == NULL) {
    432 					BWRITE(bp);
    433 				} else {
    434 					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
    435 					    bp);
    436 				}
    437 				mutex_enter(hmp);
    438 				bp = dwp->av_forw;
    439 			} else {
    440 				bp = bp->av_forw;
    441 			}
    442 		}
    443 		mutex_exit(hmp);
    444 	}
    445 
    446 	/*
    447 	 *
    448 	 * There is no need to wakeup any thread waiting on bio_mem_cv
    449 	 * since brelse will wake them up as soon as IO is complete.
    450 	 */
    451 	bfreelist.b_bcount = bcount;
    452 
    453 	if (dopageflush)
    454 		fsflush_do_pages();
    455 
    456 	if (!doiflush)
    457 		goto loop;
    458 
    459 	/*
    460 	 * If the system was not booted to single user mode, skip the
    461 	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
    462 	 */
    463 	if ((boothowto & RB_SINGLE) == 0 &&
    464 	    (ddi_get_lbolt64() / hz) < fsflush_iflush_delay)
    465 		goto loop;
    466 
    467 	/*
    468 	 * Flush cached attribute information (e.g. inodes).
    469 	 */
    470 	if (++count >= icount) {
    471 		count = 0;
    472 
    473 		/*
    474 		 * Sync back cached data.
    475 		 */
    476 		RLOCK_VFSSW();
    477 		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
    478 			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
    479 				vfs_refvfssw(vswp);
    480 				RUNLOCK_VFSSW();
    481 				(void) fsop_sync_by_kind(vswp - vfssw,
    482 				    SYNC_ATTR, kcred);
    483 				vfs_unrefvfssw(vswp);
    484 				RLOCK_VFSSW();
    485 			}
    486 		}
    487 		RUNLOCK_VFSSW();
    488 	}
    489 	goto loop;
    490 }
    491