/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
28 0 stevel */ 29 0 stevel 30 0 stevel #include <sys/types.h> 31 0 stevel #include <sys/t_lock.h> 32 0 stevel #include <sys/param.h> 33 0 stevel #include <sys/tuneable.h> 34 0 stevel #include <sys/inline.h> 35 0 stevel #include <sys/systm.h> 36 0 stevel #include <sys/proc.h> 37 0 stevel #include <sys/user.h> 38 0 stevel #include <sys/var.h> 39 0 stevel #include <sys/buf.h> 40 0 stevel #include <sys/vfs.h> 41 0 stevel #include <sys/cred.h> 42 0 stevel #include <sys/kmem.h> 43 0 stevel #include <sys/vnode.h> 44 0 stevel #include <sys/swap.h> 45 0 stevel #include <sys/vm.h> 46 0 stevel #include <sys/debug.h> 47 0 stevel #include <sys/cmn_err.h> 48 0 stevel #include <sys/sysinfo.h> 49 0 stevel #include <sys/callb.h> 50 0 stevel #include <sys/reboot.h> 51 0 stevel #include <sys/time.h> 52 0 stevel #include <sys/fs/ufs_inode.h> 53 0 stevel #include <sys/fs/ufs_bio.h> 54 0 stevel 55 0 stevel #include <vm/hat.h> 56 0 stevel #include <vm/page.h> 57 0 stevel #include <vm/pvn.h> 58 0 stevel #include <vm/seg_kmem.h> 59 0 stevel 60 0 stevel int doiflush = 1; /* non-zero to turn inode flushing on */ 61 0 stevel int dopageflush = 1; /* non-zero to turn page flushing on */ 62 0 stevel 63 0 stevel /* 64 0 stevel * To improve boot performance, don't run the inode flushing loop until 65 0 stevel * the specified number of seconds after boot. To revert to the old 66 0 stevel * behavior, set fsflush_iflush_delay to 0. We have not created any new 67 0 stevel * filesystem danger that did not exist previously, since there is always a 68 0 stevel * window in between when fsflush does the inode flush loop during which the 69 0 stevel * system could crash, fail to sync the filesystem, and fsck will be needed 70 0 stevel * to recover. We have, however, widened this window. Finally, 71 0 stevel * we never delay inode flushing if we're booting into single user mode, 72 0 stevel * where the administrator may be modifying files or using fsck. 
This 73 0 stevel * modification avoids inode flushes during boot whose only purpose is to 74 0 stevel * update atimes on files which have been accessed during boot. 75 0 stevel */ 76 0 stevel int fsflush_iflush_delay = 60; 77 0 stevel 78 0 stevel kcondvar_t fsflush_cv; 79 0 stevel static kmutex_t fsflush_lock; /* just for the cv_wait */ 80 0 stevel ksema_t fsflush_sema; /* to serialize with reboot */ 81 0 stevel 82 0 stevel /* 83 0 stevel * some statistics for fsflush_do_pages 84 0 stevel */ 85 0 stevel typedef struct { 86 0 stevel ulong_t fsf_scan; /* number of pages scanned */ 87 0 stevel ulong_t fsf_examined; /* number of page_t's actually examined, can */ 88 0 stevel /* be less than fsf_scan due to large pages */ 89 0 stevel ulong_t fsf_locked; /* pages we actually page_lock()ed */ 90 0 stevel ulong_t fsf_modified; /* number of modified pages found */ 91 0 stevel ulong_t fsf_coalesce; /* number of page coalesces done */ 92 0 stevel ulong_t fsf_time; /* nanoseconds of run time */ 93 0 stevel ulong_t fsf_releases; /* number of page_release() done */ 94 0 stevel } fsf_stat_t; 95 0 stevel 96 0 stevel fsf_stat_t fsf_recent; /* counts for most recent duty cycle */ 97 0 stevel fsf_stat_t fsf_total; /* total of counts */ 98 0 stevel ulong_t fsf_cycles; /* number of runs refelected in fsf_total */ 99 0 stevel 100 0 stevel /* 101 5331 amw * data used to determine when we can coalesce consecutive free pages 102 0 stevel * into larger pages. 103 0 stevel */ 104 0 stevel #define MAX_PAGESIZES 32 105 0 stevel static ulong_t fsf_npgsz; 106 0 stevel static pgcnt_t fsf_pgcnt[MAX_PAGESIZES]; 107 0 stevel static pgcnt_t fsf_mask[MAX_PAGESIZES]; 108 0 stevel 109 0 stevel 110 0 stevel /* 111 0 stevel * Scan page_t's and issue I/O's for modified pages. 112 0 stevel * 113 0 stevel * Also coalesces consecutive small sized free pages into the next larger 114 0 stevel * pagesize. 
This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
/*
 * Implementation notes:
 *
 * - The scan budget, nscan, is total_pages * tune.t_fsflushr / v.v_autoup
 *   and is recomputed only when total_pages changes.
 *
 * - The scan position (pp) is static, so each call resumes from the page
 *   where the previous call stopped; the first call starts at
 *   memsegs->pages.
 *
 * - fspage records whether the page just visited passed every check for
 *   being a flushable file system page.  When it did not and that page has
 *   a non-zero size code, the next step jumps past the remainder of the
 *   large page instead of advancing one page_t at a time.
 *
 * - fsf_recent is refreshed on every call; fsf_total accumulates those
 *   snapshots and is zeroed, together with fsf_cycles, every million calls.
 */
static void
fsflush_do_pages(void)
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;

	/*
	 * Check to see if total_pages has changed.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 *
		 * For a large page that is not a file system page, cnt is
		 * the distance from pp to the first page_t past the large
		 * page: the page count for the size code minus pp's offset
		 * within the large page.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (ie. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			/* a non-free page ends any run of free candidates */
			coal_page = NULL;
		}

		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}


		/*
		 * Reject pages that can't be "exclusively" locked.
		 * fspage is deliberately left as it was in this case.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit. Leaving the bit alone in hardware.
		 * It will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 *
	 * The snapshot of this duty cycle is always stored, so fsf_recent
	 * is never left holding the counts of an earlier cycle.
	 */
	fsf_recent.fsf_scan = nscan;
	fsf_recent.fsf_examined = nexamined;
	fsf_recent.fsf_locked = nlocked;
	fsf_recent.fsf_modified = nmodified;
	fsf_recent.fsf_coalesce = ncoalesce;
	fsf_recent.fsf_time = gethrtime() - timer;
	fsf_recent.fsf_releases = releases;

	/*
	 * reset the totals every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time;
		fsf_total.fsf_releases += fsf_recent.fsf_releases;
	}
}

/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
351 0 stevel */ 352 0 stevel void 353 0 stevel fsflush() 354 0 stevel { 355 0 stevel struct buf *bp, *dwp; 356 0 stevel struct hbuf *hp; 357 0 stevel int autoup; 358 0 stevel unsigned int ix, icount, count = 0; 359 0 stevel callb_cpr_t cprinfo; 360 0 stevel uint_t bcount; 361 0 stevel kmutex_t *hmp; 362 0 stevel struct vfssw *vswp; 363 0 stevel 364 0 stevel proc_fsflush = ttoproc(curthread); 365 0 stevel proc_fsflush->p_cstime = 0; 366 0 stevel proc_fsflush->p_stime = 0; 367 0 stevel proc_fsflush->p_cutime = 0; 368 0 stevel proc_fsflush->p_utime = 0; 369 3446 mrj bcopy("fsflush", curproc->p_user.u_psargs, 8); 370 3446 mrj bcopy("fsflush", curproc->p_user.u_comm, 7); 371 0 stevel 372 0 stevel mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL); 373 0 stevel sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL); 374 0 stevel 375 0 stevel /* 376 0 stevel * Setup page coalescing. 377 0 stevel */ 378 0 stevel fsf_npgsz = page_num_pagesizes(); 379 0 stevel ASSERT(fsf_npgsz < MAX_PAGESIZES); 380 0 stevel for (ix = 0; ix < fsf_npgsz - 1; ++ix) { 381 0 stevel fsf_pgcnt[ix] = 382 0 stevel page_get_pagesize(ix + 1) / page_get_pagesize(ix); 383 0 stevel fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1; 384 0 stevel } 385 0 stevel 386 0 stevel autoup = v.v_autoup * hz; 387 0 stevel icount = v.v_autoup / tune.t_fsflushr; 388 0 stevel CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush"); 389 0 stevel loop: 390 0 stevel sema_v(&fsflush_sema); 391 0 stevel mutex_enter(&fsflush_lock); 392 0 stevel CALLB_CPR_SAFE_BEGIN(&cprinfo); 393 0 stevel cv_wait(&fsflush_cv, &fsflush_lock); /* wait for clock */ 394 0 stevel CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock); 395 0 stevel mutex_exit(&fsflush_lock); 396 0 stevel sema_p(&fsflush_sema); 397 0 stevel 398 0 stevel /* 399 0 stevel * Write back all old B_DELWRI buffers on the freelist. 
400 0 stevel */ 401 0 stevel bcount = 0; 402 0 stevel for (ix = 0; ix < v.v_hbuf; ix++) { 403 0 stevel 404 0 stevel hp = &hbuf[ix]; 405 0 stevel dwp = (struct buf *)&dwbuf[ix]; 406 0 stevel 407 0 stevel bcount += (hp->b_length); 408 0 stevel 409 0 stevel if (dwp->av_forw == dwp) { 410 0 stevel continue; 411 0 stevel } 412 0 stevel 413 0 stevel hmp = &hbuf[ix].b_lock; 414 0 stevel mutex_enter(hmp); 415 0 stevel bp = dwp->av_forw; 416 0 stevel 417 0 stevel /* 418 0 stevel * Go down only on the delayed write lists. 419 0 stevel */ 420 0 stevel while (bp != dwp) { 421 0 stevel 422 0 stevel ASSERT(bp->b_flags & B_DELWRI); 423 0 stevel 424 0 stevel if ((bp->b_flags & B_DELWRI) && 425 11066 rafael (ddi_get_lbolt() - bp->b_start >= autoup) && 426 0 stevel sema_tryp(&bp->b_sem)) { 427 0 stevel bp->b_flags |= B_ASYNC; 428 0 stevel hp->b_length--; 429 0 stevel notavail(bp); 430 0 stevel mutex_exit(hmp); 431 0 stevel if (bp->b_vp == NULL) { 432 0 stevel BWRITE(bp); 433 0 stevel } else { 434 0 stevel UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, 435 8756 Amrita bp); 436 0 stevel } 437 0 stevel mutex_enter(hmp); 438 0 stevel bp = dwp->av_forw; 439 0 stevel } else { 440 0 stevel bp = bp->av_forw; 441 0 stevel } 442 0 stevel } 443 0 stevel mutex_exit(hmp); 444 0 stevel } 445 0 stevel 446 0 stevel /* 447 0 stevel * 448 0 stevel * There is no need to wakeup any thread waiting on bio_mem_cv 449 0 stevel * since brelse will wake them up as soon as IO is complete. 450 0 stevel */ 451 0 stevel bfreelist.b_bcount = bcount; 452 0 stevel 453 0 stevel if (dopageflush) 454 0 stevel fsflush_do_pages(); 455 0 stevel 456 0 stevel if (!doiflush) 457 0 stevel goto loop; 458 0 stevel 459 0 stevel /* 460 0 stevel * If the system was not booted to single user mode, skip the 461 0 stevel * inode flushing until after fsflush_iflush_delay secs have elapsed. 
462 0 stevel */ 463 0 stevel if ((boothowto & RB_SINGLE) == 0 && 464 11066 rafael (ddi_get_lbolt64() / hz) < fsflush_iflush_delay) 465 0 stevel goto loop; 466 0 stevel 467 0 stevel /* 468 0 stevel * Flush cached attribute information (e.g. inodes). 469 0 stevel */ 470 0 stevel if (++count >= icount) { 471 0 stevel count = 0; 472 0 stevel 473 0 stevel /* 474 0 stevel * Sync back cached data. 475 0 stevel */ 476 0 stevel RLOCK_VFSSW(); 477 0 stevel for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 478 0 stevel if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 479 0 stevel vfs_refvfssw(vswp); 480 0 stevel RUNLOCK_VFSSW(); 481 0 stevel (void) fsop_sync_by_kind(vswp - vfssw, 482 8756 Amrita SYNC_ATTR, kcred); 483 0 stevel vfs_unrefvfssw(vswp); 484 0 stevel RLOCK_VFSSW(); 485 0 stevel } 486 0 stevel } 487 0 stevel RUNLOCK_VFSSW(); 488 0 stevel } 489 0 stevel goto loop; 490 0 stevel } 491