Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Page Retire - Big Theory Statement.
     28  *
     29  * This file handles removing sections of faulty memory from use when the
     30  * user land FMA Diagnosis Engine requests that a page be removed or when
     31  * a CE or UE is detected by the hardware.
     32  *
     33  * In the bad old days, the kernel side of Page Retire did a lot of the work
     34  * on its own. Now, with the DE keeping track of errors, the kernel side is
     35  * rather simple minded on most platforms.
     36  *
     37  * Errors are all reflected to the DE, and after digesting the error and
     38  * looking at all previously reported errors, the DE decides what should
     39  * be done about the current error. If the DE wants a particular page to
     40  * be retired, then the kernel page retire code is invoked via an ioctl.
     41  * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
     42  * page retire to handle the error. Since page retire is just a simple
     43  * mechanism it doesn't need to differentiate between the different callers.
     44  *
     45  * The p_toxic field in the page_t is used to indicate which errors have
     46  * occurred and what action has been taken on a given page. Because errors are
     47  * reported without regard to the locked state of a page, no locks are used
     48  * to SET the error bits in p_toxic. However, in order to clear the error
     49  * bits, the page_t must be held exclusively locked.
     50  *
     51  * When page_retire() is called, it must be able to acquire locks, sleep, etc.
     52  * It must not be called from high-level interrupt context.
     53  *
     54  * Depending on how the requested page is being used at the time of the retire
     55  * request (and on the availability of sufficient system resources), the page
     56  * may be retired immediately, or just marked for retirement later. For
     57  * example, locked pages are marked, while free pages are retired. Multiple
     58  * requests may be made to retire the same page, although there is no need
     59  * to: once the p_toxic flags are set, the page will be retired as soon as it
     60  * can be exclusively locked.
     61  *
     62  * The retire mechanism is driven centrally out of page_unlock(). To expedite
     63  * the retirement of pages, further requests for SE_SHARED locks are denied
     64  * as long as a page retirement is pending. In addition, as long as pages are
     65  * pending retirement a background thread runs periodically trying to retire
     66  * those pages. Pages which could not be retired while the system is running
     67  * are scrubbed prior to rebooting to avoid latent errors on the next boot.
     68  *
     69  * UE pages without persistent errors are scrubbed and returned to service.
     70  * Recidivist pages, as well as FMA-directed requests for retirement, result
     71  * in the page being taken out of service. Once the decision is made to take
     72  * a page out of service, the page is cleared, hashed onto the retired_pages
     73  * vnode, marked as retired, and it is unlocked.  No other requesters (except
     74  * for unretire) are allowed to lock retired pages.
     75  *
     76  * The public routines return (sadly) 0 if they worked and a non-zero error
     77  * value if something went wrong. This is done for the ioctl side of the
     78  * world to allow errors to be reflected all the way out to user land. The
     79  * non-zero values are explained in comments atop each function.
     80  */
     81 
     82 /*
     83  * Things to fix:
     84  *
     85  * 	1. Trying to retire non-relocatable kvp pages may result in a
     86  *      quagmire. This is because seg_kmem() no longer keeps its pages locked,
     87  *      and calls page_lookup() in the free path; since kvp pages are modified
     88  *      and don't have a usable backing store, page_retire() can't do anything
     89  *      with them, and we'll keep denying the lock to seg_kmem_free() in a
     90  *      vicious cycle. To prevent that, we don't deny locks to kvp pages, and
     91  *      hence only try to retire a page from page_unlock() in the free path.
     92  *      Since most kernel pages are indefinitely held anyway, and don't
     93  *      participate in I/O, this is of little consequence.
     94  *
     95  *      2. Low memory situations will be interesting. If we don't have
     96  *      enough memory for page_relocate() to succeed, we won't be able to
     97  *      retire dirty pages; nobody will be able to push them out to disk
     98  *      either, since we aggressively deny the page lock. We could change
     99  *      fsflush so it can recognize this situation, grab the lock, and push
    100  *      the page out, where we'll catch it in the free path and retire it.
    101  *
    102  *	3. Beware of places that have code like this in them:
    103  *
    104  *		if (! page_tryupgrade(pp)) {
    105  *			page_unlock(pp);
    106  *			while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
    107  *				/ *NOTHING* /
    108  *			}
    109  *		}
    110  *		page_free(pp);
    111  *
    112  *	The problem is that pp can change identity right after the
    113  *	page_unlock() call.  In particular, page_retire() can step in
    114  *	there, change pp's identity, and hash pp onto the retired_vnode.
    115  *
    116  *	Of course, other functions besides page_retire() can have the
    117  *	same effect. A kmem reader can waltz by, set up a mapping to the
    118  *	page, and then unlock the page. Page_free() will then go castors
    119  *	up. So if anybody is doing this, it's already a bug.
    120  *
    121  *      4. mdboot()'s call into page_retire_mdboot() should probably be
    122  *      moved lower. Where the call is made now, we can get into trouble
    123  *      by scrubbing a kernel page that is then accessed later.
    124  */
    125 
    126 #include <sys/types.h>
    127 #include <sys/param.h>
    128 #include <sys/systm.h>
    129 #include <sys/mman.h>
    130 #include <sys/vnode.h>
    131 #include <sys/vfs_opreg.h>
    132 #include <sys/cmn_err.h>
    133 #include <sys/ksynch.h>
    134 #include <sys/thread.h>
    135 #include <sys/disp.h>
    136 #include <sys/ontrap.h>
    137 #include <sys/vmsystm.h>
    138 #include <sys/mem_config.h>
    139 #include <sys/atomic.h>
    140 #include <sys/callb.h>
    141 #include <vm/page.h>
    142 #include <vm/vm_dep.h>
    143 #include <vm/as.h>
    144 #include <vm/hat.h>
    145 #include <vm/seg_kmem.h>
    146 
    147 /*
    148  * vnode for all pages which are retired from the VM system;
    149  */
    150 vnode_t *retired_pages;
    151 
    152 static int page_retire_pp_finish(page_t *, void *, uint_t);
    153 
    154 /*
    155  * Make a list of all of the pages that have been marked for retirement
    156  * but are not yet retired.  At system shutdown, we will scrub all of the
    157  * pages in the list in case there are outstanding UEs.  Then, we
    158  * cross-check this list against the number of pages that are yet to be
    159  * retired, and if we find inconsistencies, we scan every page_t in the
    160  * whole system looking for any pages that need to be scrubbed for UEs.
    161  * The background thread also uses this queue to determine which pages
    162  * it should keep trying to retire.
    163  */
    164 #ifdef	DEBUG
    165 #define	PR_PENDING_QMAX	32
    166 #else	/* DEBUG */
    167 #define	PR_PENDING_QMAX	256
    168 #endif	/* DEBUG */
    169 page_t		*pr_pending_q[PR_PENDING_QMAX];
    170 kmutex_t	pr_q_mutex;
    171 
    172 /*
    173  * Page retire global kstats
    174  */
    175 struct page_retire_kstat {
    176 	kstat_named_t	pr_retired;
    177 	kstat_named_t	pr_requested;
    178 	kstat_named_t	pr_requested_free;
    179 	kstat_named_t	pr_enqueue_fail;
    180 	kstat_named_t	pr_dequeue_fail;
    181 	kstat_named_t	pr_pending;
    182 	kstat_named_t	pr_pending_kas;
    183 	kstat_named_t	pr_failed;
    184 	kstat_named_t	pr_failed_kernel;
    185 	kstat_named_t	pr_limit;
    186 	kstat_named_t	pr_limit_exceeded;
    187 	kstat_named_t	pr_fma;
    188 	kstat_named_t	pr_mce;
    189 	kstat_named_t	pr_ue;
    190 	kstat_named_t	pr_ue_cleared_retire;
    191 	kstat_named_t	pr_ue_cleared_free;
    192 	kstat_named_t	pr_ue_persistent;
    193 	kstat_named_t	pr_unretired;
    194 };
    195 
    196 static struct page_retire_kstat page_retire_kstat = {
    197 	{ "pages_retired",		KSTAT_DATA_UINT64},
    198 	{ "pages_retire_request",	KSTAT_DATA_UINT64},
    199 	{ "pages_retire_request_free",	KSTAT_DATA_UINT64},
    200 	{ "pages_notenqueued", 		KSTAT_DATA_UINT64},
    201 	{ "pages_notdequeued", 		KSTAT_DATA_UINT64},
    202 	{ "pages_pending", 		KSTAT_DATA_UINT64},
    203 	{ "pages_pending_kas", 		KSTAT_DATA_UINT64},
    204 	{ "pages_deferred",		KSTAT_DATA_UINT64},
    205 	{ "pages_deferred_kernel",	KSTAT_DATA_UINT64},
    206 	{ "pages_limit",		KSTAT_DATA_UINT64},
    207 	{ "pages_limit_exceeded",	KSTAT_DATA_UINT64},
    208 	{ "pages_fma",			KSTAT_DATA_UINT64},
    209 	{ "pages_multiple_ce",		KSTAT_DATA_UINT64},
    210 	{ "pages_ue",			KSTAT_DATA_UINT64},
    211 	{ "pages_ue_cleared_retired",	KSTAT_DATA_UINT64},
    212 	{ "pages_ue_cleared_freed",	KSTAT_DATA_UINT64},
    213 	{ "pages_ue_persistent",	KSTAT_DATA_UINT64},
    214 	{ "pages_unretired",		KSTAT_DATA_UINT64},
    215 };
    216 
    217 static kstat_t  *page_retire_ksp = NULL;
    218 
    219 #define	PR_INCR_KSTAT(stat)	\
    220 	atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1)
    221 #define	PR_DECR_KSTAT(stat)	\
    222 	atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1)
    223 
    224 #define	PR_KSTAT_RETIRED_CE	(page_retire_kstat.pr_mce.value.ui64)
    225 #define	PR_KSTAT_RETIRED_FMA	(page_retire_kstat.pr_fma.value.ui64)
    226 #define	PR_KSTAT_RETIRED_NOTUE	(PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
    227 #define	PR_KSTAT_PENDING	(page_retire_kstat.pr_pending.value.ui64)
    228 #define	PR_KSTAT_PENDING_KAS	(page_retire_kstat.pr_pending_kas.value.ui64)
    229 #define	PR_KSTAT_EQFAIL		(page_retire_kstat.pr_enqueue_fail.value.ui64)
    230 #define	PR_KSTAT_DQFAIL		(page_retire_kstat.pr_dequeue_fail.value.ui64)
    231 
    232 /*
    233  * page retire kstats to list all retired pages
    234  */
    235 static int pr_list_kstat_update(kstat_t *ksp, int rw);
    236 static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
    237 kmutex_t pr_list_kstat_mutex;
    238 
    239 /*
    240  * Limit the number of multiple CE page retires.
    241  * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
    242  * basis points, where 100 basis points equals one percent.
    243  */
    244 #define	MCE_BPT	10
    245 uint64_t	max_pages_retired_bps = MCE_BPT;
    246 #define	PAGE_RETIRE_LIMIT	((physmem * max_pages_retired_bps) / 10000)
    247 
    248 /*
    249  * Control over the verbosity of page retirement.
    250  *
    251  * When set to zero (the default), no messages will be printed.
    252  * When set to one, summary messages will be printed.
    253  * When set > one, all messages will be printed.
    254  *
    255  * A value of one will trigger detailed messages for retirement operations,
    256  * and is intended as a platform tunable for processors where FMA's DE does
    257  * not run (e.g., spitfire). Values > one are intended for debugging only.
    258  */
    259 int page_retire_messages = 0;
    260 
    261 /*
    262  * Control whether or not we return scrubbed UE pages to service.
    263  * By default we do not since FMA wants to run its diagnostics first
    264  * and then ask us to unretire the page if it passes. Non-FMA platforms
    265  * may set this to zero so we will only retire recidivist pages. It should
    266  * not be changed by the user.
    267  */
    268 int page_retire_first_ue = 1;
    269 
    270 /*
    271  * Master enable for page retire. This prevents a CE or UE early in boot
    272  * from trying to retire a page before page_retire_init() has finished
    273  * setting things up. This is internal only and is not a tunable!
    274  */
    275 static int pr_enable = 0;
    276 
    277 #ifdef	DEBUG
    278 struct page_retire_debug {
    279 	int prd_dup1;
    280 	int prd_dup2;
    281 	int prd_qdup;
    282 	int prd_noaction;
    283 	int prd_queued;
    284 	int prd_notqueued;
    285 	int prd_dequeue;
    286 	int prd_top;
    287 	int prd_locked;
    288 	int prd_reloc;
    289 	int prd_relocfail;
    290 	int prd_mod;
    291 	int prd_mod_late;
    292 	int prd_kern;
    293 	int prd_free;
    294 	int prd_noreclaim;
    295 	int prd_hashout;
    296 	int prd_fma;
    297 	int prd_uescrubbed;
    298 	int prd_uenotscrubbed;
    299 	int prd_mce;
    300 	int prd_prlocked;
    301 	int prd_prnotlocked;
    302 	int prd_prretired;
    303 	int prd_ulocked;
    304 	int prd_unotretired;
    305 	int prd_udestroy;
    306 	int prd_uhashout;
    307 	int prd_uunretired;
    308 	int prd_unotlocked;
    309 	int prd_checkhit;
    310 	int prd_checkmiss_pend;
    311 	int prd_checkmiss_noerr;
    312 	int prd_tctop;
    313 	int prd_tclocked;
    314 	int prd_hunt;
    315 	int prd_dohunt;
    316 	int prd_earlyhunt;
    317 	int prd_latehunt;
    318 	int prd_nofreedemote;
    319 	int prd_nodemote;
    320 	int prd_demoted;
    321 } pr_debug;
    322 
    323 #define	PR_DEBUG(foo)	((pr_debug.foo)++)
    324 
    325 /*
    326  * A type histogram. We record the incidence of the various toxic
    327  * flag combinations along with the interesting page attributes. The
    328  * goal is to get as many combinations as we can while driving all
    329  * pr_debug values nonzero (indicating we've exercised all possible
    330  * code paths across all possible page types). Not all combinations
    331  * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
    332  *
    333  * pr_type offset bit encoding (when examining with a debugger):
    334  *
    335  *    PRT_NAMED  - 0x4
    336  *    PRT_KERNEL - 0x8
    337  *    PRT_FREE   - 0x10
    338  *    PRT_MOD    - 0x20
    339  *    PRT_FMA    - 0x0
    340  *    PRT_MCE    - 0x40
    341  *    PRT_UE     - 0x80
    342  */
    343 
    344 #define	PRT_NAMED	0x01
    345 #define	PRT_KERNEL	0x02
    346 #define	PRT_FREE	0x04
    347 #define	PRT_MOD		0x08
    348 #define	PRT_FMA		0x00	/* yes, this is not a mistake */
    349 #define	PRT_MCE		0x10
    350 #define	PRT_UE		0x20
    351 #define	PRT_ALL		0x3F
    352 
    353 int pr_types[PRT_ALL+1];
    354 
    355 #define	PR_TYPES(pp)	{			\
    356 	int whichtype = 0;			\
    357 	if (pp->p_vnode)			\
    358 		whichtype |= PRT_NAMED;		\
    359 	if (PP_ISKAS(pp))			\
    360 		whichtype |= PRT_KERNEL;	\
    361 	if (PP_ISFREE(pp))			\
    362 		whichtype |= PRT_FREE;		\
    363 	if (hat_ismod(pp))			\
    364 		whichtype |= PRT_MOD;		\
    365 	if (pp->p_toxic & PR_UE)		\
    366 		whichtype |= PRT_UE;		\
    367 	if (pp->p_toxic & PR_MCE)		\
    368 		whichtype |= PRT_MCE;		\
    369 	pr_types[whichtype]++;			\
    370 }
    371 
    372 int recl_calls;
    373 int recl_mtbf = 3;
    374 int reloc_calls;
    375 int reloc_mtbf = 7;
    376 int pr_calls;
    377 int pr_mtbf = 15;
    378 
    379 #define	MTBF(v, f)	(((++(v)) & (f)) != (f))
    380 
    381 #else	/* DEBUG */
    382 
    383 #define	PR_DEBUG(foo)	/* nothing */
    384 #define	PR_TYPES(foo)	/* nothing */
    385 #define	MTBF(v, f)	(1)
    386 
    387 #endif	/* DEBUG */
    388 
    389 /*
    390  * page_retire_done() - completion processing
    391  *
    392  * Used by the page_retire code for common completion processing.
    393  * It keeps track of how many times a given result has happened,
    394  * and writes out an occasional message.
    395  *
    396  * May be called with a NULL pp (PRD_INVALID_PA case).
    397  */
    398 #define	PRD_INVALID_KEY		-1
    399 #define	PRD_SUCCESS		0
    400 #define	PRD_PENDING		1
    401 #define	PRD_FAILED		2
    402 #define	PRD_DUPLICATE		3
    403 #define	PRD_INVALID_PA		4
    404 #define	PRD_LIMIT		5
    405 #define	PRD_UE_SCRUBBED		6
    406 #define	PRD_UNR_SUCCESS		7
    407 #define	PRD_UNR_CANTLOCK	8
    408 #define	PRD_UNR_NOT		9
    409 
    410 typedef struct page_retire_op {
    411 	int	pr_key;		/* one of the PRD_* defines from above */
    412 	int	pr_count;	/* How many times this has happened */
    413 	int	pr_retval;	/* return value */
    414 	int	pr_msglvl;	/* message level - when to print */
    415 	char	*pr_message;	/* Cryptic message for field service */
    416 } page_retire_op_t;
    417 
    418 static page_retire_op_t page_retire_ops[] = {
    419 	/* key			count	retval	msglvl	message */
    420 	{PRD_SUCCESS,		0,	0,	1,
    421 		"Page 0x%08x.%08x removed from service"},
    422 	{PRD_PENDING,		0,	EAGAIN,	2,
    423 		"Page 0x%08x.%08x will be retired on free"},
    424 	{PRD_FAILED,		0,	EAGAIN,	0, NULL},
    425 	{PRD_DUPLICATE,		0,	EIO,	2,
    426 		"Page 0x%08x.%08x already retired or pending"},
    427 	{PRD_INVALID_PA,	0,	EINVAL, 2,
    428 		"PA 0x%08x.%08x is not a relocatable page"},
    429 	{PRD_LIMIT,		0,	0,	1,
    430 		"Page 0x%08x.%08x not retired due to limit exceeded"},
    431 	{PRD_UE_SCRUBBED,	0,	0,	1,
    432 		"Previously reported error on page 0x%08x.%08x cleared"},
    433 	{PRD_UNR_SUCCESS,	0,	0,	1,
    434 		"Page 0x%08x.%08x returned to service"},
    435 	{PRD_UNR_CANTLOCK,	0,	EAGAIN,	2,
    436 		"Page 0x%08x.%08x could not be unretired"},
    437 	{PRD_UNR_NOT,		0,	EIO,	2,
    438 		"Page 0x%08x.%08x is not retired"},
    439 	{PRD_INVALID_KEY,	0,	0,	0, NULL} /* MUST BE LAST! */
    440 };
    441 
    442 /*
    443  * print a message if page_retire_messages is true.
    444  */
    445 #define	PR_MESSAGE(debuglvl, msglvl, msg, pa)				\
    446 {									\
    447 	uint64_t p = (uint64_t)pa;					\
    448 	if (page_retire_messages >= msglvl && msg != NULL) {		\
    449 		cmn_err(debuglvl, msg,					\
    450 		    (uint32_t)(p >> 32), (uint32_t)p);			\
    451 	}								\
    452 }
    453 
    454 /*
    455  * Note that multiple bits may be set in a single settoxic operation.
    456  * May be called without the page locked.
    457  */
    458 void
    459 page_settoxic(page_t *pp, uchar_t bits)
    460 {
    461 	atomic_or_8(&pp->p_toxic, bits);
    462 }
    463 
    464 /*
    465  * Note that multiple bits may cleared in a single clrtoxic operation.
    466  * Must be called with the page exclusively locked to prevent races which
    467  * may attempt to retire a page without any toxic bits set.
    468  * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
    469  * being held as there is a separate mutex which protects that bit.
    470  */
    471 void
    472 page_clrtoxic(page_t *pp, uchar_t bits)
    473 {
    474 	ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
    475 	atomic_and_8(&pp->p_toxic, ~bits);
    476 }
    477 
    478 /*
    479  * Prints any page retire messages to the user, and decides what
    480  * error code is appropriate for the condition reported.
    481  */
    482 static int
    483 page_retire_done(page_t *pp, int code)
    484 {
    485 	page_retire_op_t *prop;
    486 	uint64_t	pa = 0;
    487 	int		i;
    488 
    489 	if (pp != NULL) {
    490 		pa = mmu_ptob((uint64_t)pp->p_pagenum);
    491 	}
    492 
    493 	prop = NULL;
    494 	for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
    495 		if (page_retire_ops[i].pr_key == code) {
    496 			prop = &page_retire_ops[i];
    497 			break;
    498 		}
    499 	}
    500 
    501 #ifdef	DEBUG
    502 	if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
    503 		cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
    504 	}
    505 #endif
    506 
    507 	ASSERT(prop->pr_key == code);
    508 
    509 	prop->pr_count++;
    510 
    511 	PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
    512 	if (pp != NULL) {
    513 		page_settoxic(pp, PR_MSG);
    514 	}
    515 
    516 	return (prop->pr_retval);
    517 }
    518 
    519 /*
    520  * Act like page_destroy(), but instead of freeing the page, hash it onto
    521  * the retired_pages vnode, and mark it retired.
    522  *
    523  * For fun, we try to scrub the page until it's squeaky clean.
    524  * availrmem is adjusted here.
    525  */
    526 static void
    527 page_retire_destroy(page_t *pp)
    528 {
    529 	u_offset_t off = (u_offset_t)((uintptr_t)pp);
    530 
    531 	ASSERT(PAGE_EXCL(pp));
    532 	ASSERT(!PP_ISFREE(pp));
    533 	ASSERT(pp->p_szc == 0);
    534 	ASSERT(!hat_page_is_mapped(pp));
    535 	ASSERT(!pp->p_vnode);
    536 
    537 	page_clr_all_props(pp);
    538 	pagescrub(pp, 0, MMU_PAGESIZE);
    539 
    540 	pp->p_next = NULL;
    541 	pp->p_prev = NULL;
    542 	if (page_hashin(pp, retired_pages, off, NULL) == 0) {
    543 		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
    544 	}
    545 
    546 	page_settoxic(pp, PR_RETIRED);
    547 	PR_INCR_KSTAT(pr_retired);
    548 
    549 	if (pp->p_toxic & PR_FMA) {
    550 		PR_INCR_KSTAT(pr_fma);
    551 	} else if (pp->p_toxic & PR_UE) {
    552 		PR_INCR_KSTAT(pr_ue);
    553 	} else {
    554 		PR_INCR_KSTAT(pr_mce);
    555 	}
    556 
    557 	mutex_enter(&freemem_lock);
    558 	availrmem--;
    559 	mutex_exit(&freemem_lock);
    560 
    561 	page_unlock(pp);
    562 }
    563 
    564 /*
    565  * Check whether the number of pages which have been retired already exceeds
    566  * the maximum allowable percentage of memory which may be retired.
    567  *
    568  * Returns 1 if the limit has been exceeded.
    569  */
    570 static int
    571 page_retire_limit(void)
    572 {
    573 	if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
    574 		PR_INCR_KSTAT(pr_limit_exceeded);
    575 		return (1);
    576 	}
    577 
    578 	return (0);
    579 }
    580 
    581 #define	MSG_DM	"Data Mismatch occurred at PA 0x%08x.%08x"		\
    582 	"[ 0x%x != 0x%x ] while attempting to clear previously "	\
    583 	"reported error; page removed from service"
    584 
    585 #define	MSG_UE	"Uncorrectable Error occurred at PA 0x%08x.%08x while "	\
    586 	"attempting to clear previously reported error; page removed "	\
    587 	"from service"
    588 
    589 /*
    590  * Attempt to clear a UE from a page.
    591  * Returns 1 if the error has been successfully cleared.
    592  */
    593 static int
    594 page_clear_transient_ue(page_t *pp)
    595 {
    596 	caddr_t		kaddr;
    597 	uint8_t		rb, wb;
    598 	uint64_t	pa;
    599 	uint32_t	pa_hi, pa_lo;
    600 	on_trap_data_t	otd;
    601 	int		errors = 0;
    602 	int		i;
    603 
    604 	ASSERT(PAGE_EXCL(pp));
    605 	ASSERT(PP_PR_REQ(pp));
    606 	ASSERT(pp->p_szc == 0);
    607 	ASSERT(!hat_page_is_mapped(pp));
    608 
    609 	/*
    610 	 * Clear the page and attempt to clear the UE.  If we trap
    611 	 * on the next access to the page, we know the UE has recurred.
    612 	 */
    613 	pagescrub(pp, 0, PAGESIZE);
    614 
    615 	/*
    616 	 * Map the page and write a bunch of bit patterns to compare
    617 	 * what we wrote with what we read back.  This isn't a perfect
    618 	 * test but it should be good enough to catch most of the
    619 	 * recurring UEs. If this fails to catch a recurrent UE, we'll
    620 	 * retire the page the next time we see a UE on the page.
    621 	 */
    622 	kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);
    623 
    624 	pa = ptob((uint64_t)page_pptonum(pp));
    625 	pa_hi = (uint32_t)(pa >> 32);
    626 	pa_lo = (uint32_t)pa;
    627 
    628 	/*
    629 	 * Disable preemption to prevent the off chance that
    630 	 * we migrate while in the middle of running through
    631 	 * the bit pattern and run on a different processor
    632 	 * than what we started on.
    633 	 */
    634 	kpreempt_disable();
    635 
    636 	/*
    637 	 * Fill the page with each (0x00 - 0xFF] bit pattern, flushing
    638 	 * the cache in between reading and writing.  We do this under
    639 	 * on_trap() protection to avoid recursion.
    640 	 */
    641 	if (on_trap(&otd, OT_DATA_EC)) {
    642 		PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
    643 		errors = 1;
    644 	} else {
    645 		for (wb = 0xff; wb > 0; wb--) {
    646 			for (i = 0; i < PAGESIZE; i++) {
    647 				kaddr[i] = wb;
    648 			}
    649 
    650 			sync_data_memory(kaddr, PAGESIZE);
    651 
    652 			for (i = 0; i < PAGESIZE; i++) {
    653 				rb = kaddr[i];
    654 				if (rb != wb) {
    655 					/*
    656 					 * We had a mismatch without a trap.
    657 					 * Uh-oh. Something is really wrong
    658 					 * with this system.
    659 					 */
    660 					if (page_retire_messages) {
    661 						cmn_err(CE_WARN, MSG_DM,
    662 						    pa_hi, pa_lo, rb, wb);
    663 					}
    664 					errors = 1;
    665 					goto out;	/* double break */
    666 				}
    667 			}
    668 		}
    669 	}
    670 out:
    671 	no_trap();
    672 	kpreempt_enable();
    673 	ppmapout(kaddr);
    674 
    675 	return (errors ? 0 : 1);
    676 }
    677 
    678 /*
    679  * Try to clear a page_t with a single UE. If the UE was transient, it is
    680  * returned to service, and we return 1. Otherwise we return 0 meaning
    681  * that further processing is required to retire the page.
    682  */
    683 static int
    684 page_retire_transient_ue(page_t *pp)
    685 {
    686 	ASSERT(PAGE_EXCL(pp));
    687 	ASSERT(!hat_page_is_mapped(pp));
    688 
    689 	/*
    690 	 * If this page is a repeat offender, retire him under the
    691 	 * "two strikes and you're out" rule. The caller is responsible
    692 	 * for scrubbing the page to try to clear the error.
    693 	 */
    694 	if (pp->p_toxic & PR_UE_SCRUBBED) {
    695 		PR_INCR_KSTAT(pr_ue_persistent);
    696 		return (0);
    697 	}
    698 
    699 	if (page_clear_transient_ue(pp)) {
    700 		/*
    701 		 * We set the PR_SCRUBBED_UE bit; if we ever see this
    702 		 * page again, we will retire it, no questions asked.
    703 		 */
    704 		page_settoxic(pp, PR_UE_SCRUBBED);
    705 
    706 		if (page_retire_first_ue) {
    707 			PR_INCR_KSTAT(pr_ue_cleared_retire);
    708 			return (0);
    709 		} else {
    710 			PR_INCR_KSTAT(pr_ue_cleared_free);
    711 
    712 			page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);
    713 
    714 			/* LINTED: CONSTCOND */
    715 			VN_DISPOSE(pp, B_FREE, 1, kcred);
    716 			return (1);
    717 		}
    718 	}
    719 
    720 	PR_INCR_KSTAT(pr_ue_persistent);
    721 	return (0);
    722 }
    723 
    724 /*
    725  * Update the statistics dynamically when our kstat is read.
    726  */
    727 static int
    728 page_retire_kstat_update(kstat_t *ksp, int rw)
    729 {
    730 	struct page_retire_kstat *pr;
    731 
    732 	if (ksp == NULL)
    733 		return (EINVAL);
    734 
    735 	switch (rw) {
    736 
    737 	case KSTAT_READ:
    738 		pr = (struct page_retire_kstat *)ksp->ks_data;
    739 		ASSERT(pr == &page_retire_kstat);
    740 		pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
    741 		return (0);
    742 
    743 	case KSTAT_WRITE:
    744 		return (EACCES);
    745 
    746 	default:
    747 		return (EINVAL);
    748 	}
    749 	/*NOTREACHED*/
    750 }
    751 
    752 static int
    753 pr_list_kstat_update(kstat_t *ksp, int rw)
    754 {
    755 	uint_t count;
    756 	page_t *pp;
    757 	kmutex_t *vphm;
    758 
    759 	if (rw == KSTAT_WRITE)
    760 		return (EACCES);
    761 
    762 	vphm = page_vnode_mutex(retired_pages);
    763 	mutex_enter(vphm);
    764 	/* Needs to be under a lock so that for loop will work right */
    765 	if (retired_pages->v_pages == NULL) {
    766 		mutex_exit(vphm);
    767 		ksp->ks_ndata = 0;
    768 		ksp->ks_data_size = 0;
    769 		return (0);
    770 	}
    771 
    772 	count = 1;
    773 	for (pp = retired_pages->v_pages->p_vpnext;
    774 	    pp != retired_pages->v_pages; pp = pp->p_vpnext) {
    775 		count++;
    776 	}
    777 	mutex_exit(vphm);
    778 
    779 	ksp->ks_ndata = count;
    780 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
    781 
    782 	return (0);
    783 }
    784 
    785 /*
    786  * all spans will be pagesize and no coalescing will be done with the
    787  * list produced.
    788  */
    789 static int
    790 pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
    791 {
    792 	kmutex_t *vphm;
    793 	page_t *pp;
    794 	struct memunit {
    795 		uint64_t address;
    796 		uint64_t size;
    797 	} *kspmem;
    798 
    799 	if (rw == KSTAT_WRITE)
    800 		return (EACCES);
    801 
    802 	ksp->ks_snaptime = gethrtime();
    803 
    804 	kspmem = (struct memunit *)buf;
    805 
    806 	vphm = page_vnode_mutex(retired_pages);
    807 	mutex_enter(vphm);
    808 	pp = retired_pages->v_pages;
    809 	if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
    810 	    (pp == NULL)) {
    811 		mutex_exit(vphm);
    812 		return (0);
    813 	}
    814 	kspmem->address = ptob(pp->p_pagenum);
    815 	kspmem->size = PAGESIZE;
    816 	kspmem++;
    817 	for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
    818 	    pp = pp->p_vpnext, kspmem++) {
    819 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
    820 			break;
    821 		kspmem->address = ptob(pp->p_pagenum);
    822 		kspmem->size = PAGESIZE;
    823 	}
    824 	mutex_exit(vphm);
    825 
    826 	return (0);
    827 }
    828 
    829 /*
    830  * page_retire_pend_count -- helper function for page_capture_thread,
    831  * returns the number of pages pending retirement.
    832  */
    833 uint64_t
    834 page_retire_pend_count(void)
    835 {
    836 	return (PR_KSTAT_PENDING);
    837 }
    838 
    839 uint64_t
    840 page_retire_pend_kas_count(void)
    841 {
    842 	return (PR_KSTAT_PENDING_KAS);
    843 }
    844 
    845 void
    846 page_retire_incr_pend_count(void *datap)
    847 {
    848 	PR_INCR_KSTAT(pr_pending);
    849 
    850 	if ((datap == &kvp) || (datap == &zvp)) {
    851 		PR_INCR_KSTAT(pr_pending_kas);
    852 	}
    853 }
    854 
    855 void
    856 page_retire_decr_pend_count(void *datap)
    857 {
    858 	PR_DECR_KSTAT(pr_pending);
    859 
    860 	if ((datap == &kvp) || (datap == &zvp)) {
    861 		PR_DECR_KSTAT(pr_pending_kas);
    862 	}
    863 }
    864 
    865 /*
    866  * Initialize the page retire mechanism:
    867  *
    868  *   - Establish the correctable error retire limit.
    869  *   - Initialize locks.
    870  *   - Build the retired_pages vnode.
    871  *   - Set up the kstats.
    872  *   - Fire off the background thread.
    873  *   - Tell page_retire() it's OK to start retiring pages.
    874  */
    875 void
    876 page_retire_init(void)
    877 {
    878 	const fs_operation_def_t retired_vnodeops_template[] = {
    879 		{ NULL, NULL }
    880 	};
    881 	struct vnodeops *vops;
    882 	kstat_t *ksp;
    883 
    884 	const uint_t page_retire_ndata =
    885 	    sizeof (page_retire_kstat) / sizeof (kstat_named_t);
    886 
    887 	ASSERT(page_retire_ksp == NULL);
    888 
    889 	if (max_pages_retired_bps <= 0) {
    890 		max_pages_retired_bps = MCE_BPT;
    891 	}
    892 
    893 	mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);
    894 
    895 	retired_pages = vn_alloc(KM_SLEEP);
    896 	if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
    897 		cmn_err(CE_PANIC,
    898 		    "page_retired_init: can't make retired vnodeops");
    899 	}
    900 	vn_setops(retired_pages, vops);
    901 
    902 	if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
    903 	    "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
    904 	    KSTAT_FLAG_VIRTUAL)) == NULL) {
    905 		cmn_err(CE_WARN, "kstat_create for page_retire failed");
    906 	} else {
    907 		page_retire_ksp->ks_data = (void *)&page_retire_kstat;
    908 		page_retire_ksp->ks_update = page_retire_kstat_update;
    909 		kstat_install(page_retire_ksp);
    910 	}
    911 
    912 	mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
    913 	ksp = kstat_create("unix", 0, "page_retire_list", "misc",
    914 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
    915 	if (ksp != NULL) {
    916 		ksp->ks_update = pr_list_kstat_update;
    917 		ksp->ks_snapshot = pr_list_kstat_snapshot;
    918 		ksp->ks_lock = &pr_list_kstat_mutex;
    919 		kstat_install(ksp);
    920 	}
    921 
    922 	page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
    923 	pr_enable = 1;
    924 }
    925 
    926 /*
    927  * page_retire_hunt() callback for the retire thread.
    928  */
    929 static void
    930 page_retire_thread_cb(page_t *pp)
    931 {
    932 	PR_DEBUG(prd_tctop);
    933 	if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
    934 		PR_DEBUG(prd_tclocked);
    935 		page_unlock(pp);
    936 	}
    937 }
    938 
    939 /*
    940  * Callback used by page_trycapture() to finish off retiring a page.
    941  * The page has already been cleaned and we've been given sole access to
    942  * it.
    943  * Always returns 0 to indicate that callback succeded as the callback never
    944  * fails to finish retiring the given page.
    945  */
    946 /*ARGSUSED*/
    947 static int
    948 page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
    949 {
    950 	int		toxic;
    951 
    952 	ASSERT(PAGE_EXCL(pp));
    953 	ASSERT(pp->p_iolock_state == 0);
    954 	ASSERT(pp->p_szc == 0);
    955 
    956 	toxic = pp->p_toxic;
    957 
    958 	/*
    959 	 * The problem page is locked, demoted, unmapped, not free,
    960 	 * hashed out, and not COW or mlocked (whew!).
    961 	 *
    962 	 * Now we select our ammunition, take it around back, and shoot it.
    963 	 */
    964 	if (toxic & PR_UE) {
    965 ue_error:
    966 		if (page_retire_transient_ue(pp)) {
    967 			PR_DEBUG(prd_uescrubbed);
    968 			(void) page_retire_done(pp, PRD_UE_SCRUBBED);
    969 		} else {
    970 			PR_DEBUG(prd_uenotscrubbed);
    971 			page_retire_destroy(pp);
    972 			(void) page_retire_done(pp, PRD_SUCCESS);
    973 		}
    974 		return (0);
    975 	} else if (toxic & PR_FMA) {
    976 		PR_DEBUG(prd_fma);
    977 		page_retire_destroy(pp);
    978 		(void) page_retire_done(pp, PRD_SUCCESS);
    979 		return (0);
    980 	} else if (toxic & PR_MCE) {
    981 		PR_DEBUG(prd_mce);
    982 		page_retire_destroy(pp);
    983 		(void) page_retire_done(pp, PRD_SUCCESS);
    984 		return (0);
    985 	}
    986 
    987 	/*
    988 	 * When page_retire_first_ue is set to zero and a UE occurs which is
    989 	 * transient, it's possible that we clear some flags set by a second
    990 	 * UE error on the page which occurs while the first is currently being
    991 	 * handled and thus we need to handle the case where none of the above
    992 	 * are set.  In this instance, PR_UE_SCRUBBED should be set and thus
    993 	 * we should execute the UE code above.
    994 	 */
    995 	if (toxic & PR_UE_SCRUBBED) {
    996 		goto ue_error;
    997 	}
    998 
    999 	/*
   1000 	 * It's impossible to get here.
   1001 	 */
   1002 	panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
   1003 	return (0);
   1004 }
   1005 
   1006 /*
   1007  * page_retire() - the front door in to retire a page.
   1008  *
   1009  * Ideally, page_retire() would instantly retire the requested page.
   1010  * Unfortunately, some pages are locked or otherwise tied up and cannot be
   1011  * retired right away.  We use the page capture logic to deal with this
   1012  * situation as it will continuously try to retire the page in the background
   1013  * if the first attempt fails.  Success is determined by looking to see whether
   1014  * the page has been retired after the page_trycapture() attempt.
   1015  *
   1016  * Returns:
   1017  *
   1018  *   - 0 on success,
   1019  *   - EINVAL when the PA is whacko,
   1020  *   - EIO if the page is already retired or already pending retirement, or
   1021  *   - EAGAIN if the page could not be _immediately_ retired but is pending.
   1022  */
   1023 int
   1024 page_retire(uint64_t pa, uchar_t reason)
   1025 {
   1026 	page_t	*pp;
   1027 
   1028 	ASSERT(reason & PR_REASONS);		/* there must be a reason */
   1029 	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */
   1030 
   1031 	pp = page_numtopp_nolock(mmu_btop(pa));
   1032 	if (pp == NULL) {
   1033 		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
   1034 		    " page 0x%08x.%08x; page is not relocatable memory", pa);
   1035 		return (page_retire_done(pp, PRD_INVALID_PA));
   1036 	}
   1037 	if (PP_RETIRED(pp)) {
   1038 		PR_DEBUG(prd_dup1);
   1039 		return (page_retire_done(pp, PRD_DUPLICATE));
   1040 	}
   1041 
   1042 	if ((reason & PR_UE) && !PP_TOXIC(pp)) {
   1043 		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
   1044 		    " page 0x%08x.%08x", pa);
   1045 	} else if (PP_PR_REQ(pp)) {
   1046 		PR_DEBUG(prd_dup2);
   1047 		return (page_retire_done(pp, PRD_DUPLICATE));
   1048 	} else {
   1049 		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
   1050 		    " page 0x%08x.%08x", pa);
   1051 	}
   1052 
   1053 	/* Avoid setting toxic bits in the first place */
   1054 	if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
   1055 	    page_retire_limit()) {
   1056 		return (page_retire_done(pp, PRD_LIMIT));
   1057 	}
   1058 
   1059 	if (MTBF(pr_calls, pr_mtbf)) {
   1060 		page_settoxic(pp, reason);
   1061 		if (page_trycapture(pp, 0, CAPTURE_RETIRE, pp->p_vnode) == 0) {
   1062 			PR_DEBUG(prd_prlocked);
   1063 		} else {
   1064 			PR_DEBUG(prd_prnotlocked);
   1065 		}
   1066 	} else {
   1067 		PR_DEBUG(prd_prnotlocked);
   1068 	}
   1069 
   1070 	if (PP_RETIRED(pp)) {
   1071 		PR_DEBUG(prd_prretired);
   1072 		return (0);
   1073 	} else {
   1074 		cv_signal(&pc_cv);
   1075 		PR_INCR_KSTAT(pr_failed);
   1076 
   1077 		if (pp->p_toxic & PR_MSG) {
   1078 			return (page_retire_done(pp, PRD_FAILED));
   1079 		} else {
   1080 			return (page_retire_done(pp, PRD_PENDING));
   1081 		}
   1082 	}
   1083 }
   1084 
   1085 /*
   1086  * Take a retired page off the retired-pages vnode and clear the toxic flags.
   1087  * If "free" is nonzero, lock it and put it back on the freelist. If "free"
   1088  * is zero, the caller already holds SE_EXCL lock so we simply unretire it
   1089  * and don't do anything else with it.
   1090  *
   1091  * Any unretire messages are printed from this routine.
   1092  *
   1093  * Returns 0 if page pp was unretired; else an error code.
   1094  *
   1095  * If flags is:
   1096  *	PR_UNR_FREE - lock the page, clear the toxic flags and free it
   1097  *	    to the freelist.
   1098  *	PR_UNR_TEMP - lock the page, unretire it, leave the toxic
   1099  *	    bits set as is and return it to the caller.
   1100  *	PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
   1101  *	    toxic flags and return it to caller as is.
   1102  */
   1103 int
   1104 page_unretire_pp(page_t *pp, int flags)
   1105 {
   1106 	/*
   1107 	 * To be retired, a page has to be hashed onto the retired_pages vnode
   1108 	 * and have PR_RETIRED set in p_toxic.
   1109 	 */
   1110 	if (flags == PR_UNR_CLEAN ||
   1111 	    page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
   1112 		ASSERT(PAGE_EXCL(pp));
   1113 		PR_DEBUG(prd_ulocked);
   1114 		if (!PP_RETIRED(pp)) {
   1115 			PR_DEBUG(prd_unotretired);
   1116 			page_unlock(pp);
   1117 			return (page_retire_done(pp, PRD_UNR_NOT));
   1118 		}
   1119 
   1120 		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
   1121 		    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
   1122 		if (pp->p_toxic & PR_FMA) {
   1123 			PR_DECR_KSTAT(pr_fma);
   1124 		} else if (pp->p_toxic & PR_UE) {
   1125 			PR_DECR_KSTAT(pr_ue);
   1126 		} else {
   1127 			PR_DECR_KSTAT(pr_mce);
   1128 		}
   1129 
   1130 		if (flags == PR_UNR_TEMP)
   1131 			page_clrtoxic(pp, PR_RETIRED);
   1132 		else
   1133 			page_clrtoxic(pp, PR_TOXICFLAGS);
   1134 
   1135 		if (flags == PR_UNR_FREE) {
   1136 			PR_DEBUG(prd_udestroy);
   1137 			page_destroy(pp, 0);
   1138 		} else {
   1139 			PR_DEBUG(prd_uhashout);
   1140 			page_hashout(pp, NULL);
   1141 		}
   1142 
   1143 		mutex_enter(&freemem_lock);
   1144 		availrmem++;
   1145 		mutex_exit(&freemem_lock);
   1146 
   1147 		PR_DEBUG(prd_uunretired);
   1148 		PR_DECR_KSTAT(pr_retired);
   1149 		PR_INCR_KSTAT(pr_unretired);
   1150 		return (page_retire_done(pp, PRD_UNR_SUCCESS));
   1151 	}
   1152 	PR_DEBUG(prd_unotlocked);
   1153 	return (page_retire_done(pp, PRD_UNR_CANTLOCK));
   1154 }
   1155 
   1156 /*
   1157  * Return a page to service by moving it from the retired_pages vnode
   1158  * onto the freelist.
   1159  *
   1160  * Called from mmioctl_page_retire() on behalf of the FMA DE.
   1161  *
   1162  * Returns:
   1163  *
   1164  *   - 0 if the page is unretired,
   1165  *   - EAGAIN if the pp can not be locked,
   1166  *   - EINVAL if the PA is whacko, and
   1167  *   - EIO if the pp is not retired.
   1168  */
   1169 int
   1170 page_unretire(uint64_t pa)
   1171 {
   1172 	page_t	*pp;
   1173 
   1174 	pp = page_numtopp_nolock(mmu_btop(pa));
   1175 	if (pp == NULL) {
   1176 		return (page_retire_done(pp, PRD_INVALID_PA));
   1177 	}
   1178 
   1179 	return (page_unretire_pp(pp, PR_UNR_FREE));
   1180 }
   1181 
   1182 /*
   1183  * Test a page to see if it is retired. If errors is non-NULL, the toxic
   1184  * bits of the page are returned. Returns 0 on success, error code on failure.
   1185  */
   1186 int
   1187 page_retire_check_pp(page_t *pp, uint64_t *errors)
   1188 {
   1189 	int rc;
   1190 
   1191 	if (PP_RETIRED(pp)) {
   1192 		PR_DEBUG(prd_checkhit);
   1193 		rc = 0;
   1194 	} else if (PP_PR_REQ(pp)) {
   1195 		PR_DEBUG(prd_checkmiss_pend);
   1196 		rc = EAGAIN;
   1197 	} else {
   1198 		PR_DEBUG(prd_checkmiss_noerr);
   1199 		rc = EIO;
   1200 	}
   1201 
   1202 	/*
   1203 	 * We have magically arranged the bit values returned to fmd(1M)
   1204 	 * to line up with the FMA, MCE, and UE bits of the page_t.
   1205 	 */
   1206 	if (errors) {
   1207 		uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
   1208 		if (toxic & PR_UE_SCRUBBED) {
   1209 			toxic &= ~PR_UE_SCRUBBED;
   1210 			toxic |= PR_UE;
   1211 		}
   1212 		*errors = toxic;
   1213 	}
   1214 
   1215 	return (rc);
   1216 }
   1217 
   1218 /*
   1219  * Test to see if the page_t for a given PA is retired, and return the
   1220  * hardware errors we have seen on the page if requested.
   1221  *
   1222  * Called from mmioctl_page_retire on behalf of the FMA DE.
   1223  *
   1224  * Returns:
   1225  *
   1226  *   - 0 if the page is retired,
   1227  *   - EIO if the page is not retired and has no errors,
   1228  *   - EAGAIN if the page is not retired but is pending; and
   1229  *   - EINVAL if the PA is whacko.
   1230  */
   1231 int
   1232 page_retire_check(uint64_t pa, uint64_t *errors)
   1233 {
   1234 	page_t	*pp;
   1235 
   1236 	if (errors) {
   1237 		*errors = 0;
   1238 	}
   1239 
   1240 	pp = page_numtopp_nolock(mmu_btop(pa));
   1241 	if (pp == NULL) {
   1242 		return (page_retire_done(pp, PRD_INVALID_PA));
   1243 	}
   1244 
   1245 	return (page_retire_check_pp(pp, errors));
   1246 }
   1247 
   1248 /*
   1249  * Page retire self-test. For now, it always returns 0.
   1250  */
   1251 int
   1252 page_retire_test(void)
   1253 {
   1254 	page_t *first, *pp, *cpp, *cpp2, *lpp;
   1255 
   1256 	/*
   1257 	 * Tests the corner case where a large page can't be retired
   1258 	 * because one of the constituent pages is locked. We mark
   1259 	 * one page to be retired and try to retire it, and mark the
   1260 	 * other page to be retired but don't try to retire it, so
   1261 	 * that page_unlock() in the failure path will recurse and try
   1262 	 * to retire THAT page. This is the worst possible situation
   1263 	 * we can get ourselves into.
   1264 	 */
   1265 	memsegs_lock(0);
   1266 	pp = first = page_first();
   1267 	do {
   1268 		if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
   1269 			cpp = pp + 1;
   1270 			lpp = PP_ISFREE(pp)? pp : pp + 2;
   1271 			cpp2 = pp + 3;
   1272 			if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
   1273 				continue;
   1274 			if (!page_trylock(cpp, SE_EXCL)) {
   1275 				page_unlock(lpp);
   1276 				continue;
   1277 			}
   1278 
   1279 			/* fails */
   1280 			(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
   1281 
   1282 			page_unlock(lpp);
   1283 			page_unlock(cpp);
   1284 			(void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
   1285 			(void) page_retire(ptob(cpp2->p_pagenum), PR_FMA);
   1286 		}
   1287 	} while ((pp = page_next(pp)) != first);
   1288 	memsegs_unlock(0);
   1289 
   1290 	return (0);
   1291 }
   1292