Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * VM - Hardware Address Translation management for Spitfire MMU.
     28  *
     29  * This file implements the machine specific hardware translation
     30  * needed by the VM system.  The machine independent interface is
     31  * described in <vm/hat.h> while the machine dependent interface
     32  * and data structures are described in <vm/hat_sfmmu.h>.
     33  *
     34  * The hat layer manages the address translation hardware as a cache
     35  * driven by calls from the higher levels in the VM system.
     36  */
     37 
     38 #include <sys/types.h>
     39 #include <sys/kstat.h>
     40 #include <vm/hat.h>
     41 #include <vm/hat_sfmmu.h>
     42 #include <vm/page.h>
     43 #include <sys/pte.h>
     44 #include <sys/systm.h>
     45 #include <sys/mman.h>
     46 #include <sys/sysmacros.h>
     47 #include <sys/machparam.h>
     48 #include <sys/vtrace.h>
     49 #include <sys/kmem.h>
     50 #include <sys/mmu.h>
     51 #include <sys/cmn_err.h>
     52 #include <sys/cpu.h>
     53 #include <sys/cpuvar.h>
     54 #include <sys/debug.h>
     55 #include <sys/lgrp.h>
     56 #include <sys/archsystm.h>
     57 #include <sys/machsystm.h>
     58 #include <sys/vmsystm.h>
     59 #include <vm/as.h>
     60 #include <vm/seg.h>
     61 #include <vm/seg_kp.h>
     62 #include <vm/seg_kmem.h>
     63 #include <vm/seg_kpm.h>
     64 #include <vm/rm.h>
     65 #include <sys/t_lock.h>
     66 #include <sys/obpdefs.h>
     67 #include <sys/vm_machparam.h>
     68 #include <sys/var.h>
     69 #include <sys/trap.h>
     70 #include <sys/machtrap.h>
     71 #include <sys/scb.h>
     72 #include <sys/bitmap.h>
     73 #include <sys/machlock.h>
     74 #include <sys/membar.h>
     75 #include <sys/atomic.h>
     76 #include <sys/cpu_module.h>
     77 #include <sys/prom_debug.h>
     78 #include <sys/ksynch.h>
     79 #include <sys/mem_config.h>
     80 #include <sys/mem_cage.h>
     81 #include <vm/vm_dep.h>
     82 #include <vm/xhat_sfmmu.h>
     83 #include <sys/fpu/fpusystm.h>
     84 #include <vm/mach_kpm.h>
     85 #include <sys/callb.h>
     86 
     87 #ifdef	DEBUG
     88 #define	SFMMU_VALIDATE_HMERID(hat, rid, saddr, len)			\
     89 	if (SFMMU_IS_SHMERID_VALID(rid)) {				\
     90 		caddr_t _eaddr = (saddr) + (len);			\
     91 		sf_srd_t *_srdp;					\
     92 		sf_region_t *_rgnp;					\
     93 		ASSERT((rid) < SFMMU_MAX_HME_REGIONS);			\
     94 		ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid));	\
     95 		ASSERT((hat) != ksfmmup);				\
     96 		_srdp = (hat)->sfmmu_srdp;				\
     97 		ASSERT(_srdp != NULL);					\
     98 		ASSERT(_srdp->srd_refcnt != 0);				\
     99 		_rgnp = _srdp->srd_hmergnp[(rid)];			\
    100 		ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid);		\
    101 		ASSERT(_rgnp->rgn_refcnt != 0);				\
    102 		ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE));	\
    103 		ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) ==	\
    104 		    SFMMU_REGION_HME);					\
    105 		ASSERT((saddr) >= _rgnp->rgn_saddr);			\
    106 		ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size);	\
    107 		ASSERT(_eaddr > _rgnp->rgn_saddr);			\
    108 		ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size);	\
    109 	}
    110 
    111 #define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) 	 	 \
    112 {						 			 \
    113 		caddr_t _hsva;						 \
    114 		caddr_t _heva;						 \
    115 		caddr_t _rsva;					 	 \
    116 		caddr_t _reva;					 	 \
    117 		int	_ttesz = get_hblk_ttesz(hmeblkp);		 \
    118 		int	_flagtte;					 \
    119 		ASSERT((srdp)->srd_refcnt != 0);			 \
    120 		ASSERT((rid) < SFMMU_MAX_HME_REGIONS);			 \
    121 		ASSERT((rgnp)->rgn_id == rid);				 \
    122 		ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE));	 \
    123 		ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) ==	 \
    124 		    SFMMU_REGION_HME);					 \
    125 		ASSERT(_ttesz <= (rgnp)->rgn_pgszc);			 \
    126 		_hsva = (caddr_t)get_hblk_base(hmeblkp);		 \
    127 		_heva = get_hblk_endaddr(hmeblkp);			 \
    128 		_rsva = (caddr_t)P2ALIGN(				 \
    129 		    (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES);	 \
    130 		_reva = (caddr_t)P2ROUNDUP(				 \
    131 		    (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size),	 \
    132 		    HBLK_MIN_BYTES);					 \
    133 		ASSERT(_hsva >= _rsva);				 	 \
    134 		ASSERT(_hsva < _reva);				 	 \
    135 		ASSERT(_heva > _rsva);				 	 \
    136 		ASSERT(_heva <= _reva);				 	 \
    137 		_flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ :  \
    138 			_ttesz;						 \
    139 		ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte));		 \
    140 }
    141 
    142 #else /* DEBUG */
    143 #define	SFMMU_VALIDATE_HMERID(hat, rid, addr, len)
    144 #define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
    145 #endif /* DEBUG */
    146 
    147 #if defined(SF_ERRATA_57)
    148 extern caddr_t errata57_limit;
    149 #endif
    150 
    151 #define	HME8BLK_SZ_RND		((roundup(HME8BLK_SZ, sizeof (int64_t))) /  \
    152 				(sizeof (int64_t)))
    153 #define	HBLK_RESERVE		((struct hme_blk *)hblk_reserve)
    154 
    155 #define	HBLK_RESERVE_CNT	128
    156 #define	HBLK_RESERVE_MIN	20
    157 
    158 static struct hme_blk		*freehblkp;
    159 static kmutex_t			freehblkp_lock;
    160 static int			freehblkcnt;
    161 
    162 static int64_t			hblk_reserve[HME8BLK_SZ_RND];
    163 static kmutex_t			hblk_reserve_lock;
    164 static kthread_t		*hblk_reserve_thread;
    165 
    166 static nucleus_hblk8_info_t	nucleus_hblk8;
    167 static nucleus_hblk1_info_t	nucleus_hblk1;
    168 
    169 /*
    170  * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here
    171  * after the initial phase of removing an hmeblk from the hash chain, see
    172  * the detailed comment in sfmmu_hblk_hash_rm() for further details.
    173  */
    174 static cpu_hme_pend_t		*cpu_hme_pend;
    175 static uint_t			cpu_hme_pend_thresh;
    176 /*
    177  * SFMMU specific hat functions
    178  */
    179 void	hat_pagecachectl(struct page *, int);
    180 
    181 /* flags for hat_pagecachectl */
    182 #define	HAT_CACHE	0x1
    183 #define	HAT_UNCACHE	0x2
    184 #define	HAT_TMPNC	0x4
    185 
    186 /*
    187  * Flag to allow the creation of non-cacheable translations
    188  * to system memory. It is off by default. At the moment this
    189  * flag is used by the ecache error injector. The error injector
    190  * will turn it on when creating such a translation then shut it
    191  * off when it's finished.
    192  */
    193 
    194 int	sfmmu_allow_nc_trans = 0;
    195 
    196 /*
    197  * Flag to disable large page support.
    198  * 	value of 1 => disable all large pages.
    199  *	bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
    200  *
    201  * For example, use the value 0x4 to disable 512K pages.
    202  *
    203  */
    204 #define	LARGE_PAGES_OFF		0x1
    205 
    206 /*
    207  * The disable_large_pages and disable_ism_large_pages variables control
    208  * hat_memload_array and the page sizes to be used by ISM and the kernel.
    209  *
    210  * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
    211  * are only used to control which OOB pages to use at upper VM segment creation
    212  * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
    213  * Their values may come from platform or CPU specific code to disable page
    214  * sizes that should not be used.
    215  *
    216  * WARNING: 512K pages are currently not supported for ISM/DISM.
    217  */
    218 uint_t	disable_large_pages = 0;
    219 uint_t	disable_ism_large_pages = (1 << TTE512K);
    220 uint_t	disable_auto_data_large_pages = 0;
    221 uint_t	disable_auto_text_large_pages = 0;
    222 
    223 /*
    224  * Private sfmmu data structures for hat management
    225  */
    226 static struct kmem_cache *sfmmuid_cache;
    227 static struct kmem_cache *mmuctxdom_cache;
    228 
    229 /*
    230  * Private sfmmu data structures for tsb management
    231  */
    232 static struct kmem_cache *sfmmu_tsbinfo_cache;
    233 static struct kmem_cache *sfmmu_tsb8k_cache;
    234 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
    235 static vmem_t *kmem_bigtsb_arena;
    236 static vmem_t *kmem_tsb_arena;
    237 
    238 /*
    239  * sfmmu static variables for hmeblk resource management.
    240  */
    241 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
    242 static struct kmem_cache *sfmmu8_cache;
    243 static struct kmem_cache *sfmmu1_cache;
    244 static struct kmem_cache *pa_hment_cache;
    245 
    246 static kmutex_t 	ism_mlist_lock;	/* mutex for ism mapping list */
    247 /*
    248  * private data for ism
    249  */
    250 static struct kmem_cache *ism_blk_cache;
    251 static struct kmem_cache *ism_ment_cache;
    252 #define	ISMID_STARTADDR	NULL
    253 
    254 /*
    255  * Region management data structures and function declarations.
    256  */
    257 
    258 static void	sfmmu_leave_srd(sfmmu_t *);
    259 static int	sfmmu_srdcache_constructor(void *, void *, int);
    260 static void	sfmmu_srdcache_destructor(void *, void *);
    261 static int	sfmmu_rgncache_constructor(void *, void *, int);
    262 static void	sfmmu_rgncache_destructor(void *, void *);
    263 static int	sfrgnmap_isnull(sf_region_map_t *);
    264 static int	sfhmergnmap_isnull(sf_hmeregion_map_t *);
    265 static int	sfmmu_scdcache_constructor(void *, void *, int);
    266 static void	sfmmu_scdcache_destructor(void *, void *);
    267 static void	sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
    268     size_t, void *, u_offset_t);
    269 
    270 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
    271 static sf_srd_bucket_t *srd_buckets;
    272 static struct kmem_cache *srd_cache;
    273 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
    274 static struct kmem_cache *region_cache;
    275 static struct kmem_cache *scd_cache;
    276 
    277 #ifdef sun4v
    278 int use_bigtsb_arena = 1;
    279 #else
    280 int use_bigtsb_arena = 0;
    281 #endif
    282 
    283 /* External /etc/system tunable, for turning on&off the shctx support */
    284 int disable_shctx = 0;
    285 /* Internal variable, set by MD if the HW supports shctx feature */
    286 int shctx_on = 0;
    287 
    288 #ifdef DEBUG
    289 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
    290 #endif
    291 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
    292 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);
    293 
    294 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
    295 static void sfmmu_find_scd(sfmmu_t *);
    296 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
    297 static void sfmmu_finish_join_scd(sfmmu_t *);
    298 static void sfmmu_leave_scd(sfmmu_t *, uchar_t);
    299 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
    300 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
    301 static void sfmmu_free_scd_tsbs(sfmmu_t *);
    302 static void sfmmu_tsb_inv_ctx(sfmmu_t *);
    303 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
    304 static void sfmmu_ism_hatflags(sfmmu_t *, int);
    305 static int sfmmu_srd_lock_held(sf_srd_t *);
    306 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
    307 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
    308 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
    309 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
    310 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
    311 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);
    312 
    313 /*
    314  * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
    315  * HAT flags, synchronizing TLB/TSB coherency, and context management.
    316  * The lock is hashed on the sfmmup since the case where we need to lock
    317  * all processes is rare but does occur (e.g. we need to unload a shared
    318  * mapping from all processes using the mapping).  We have a lot of buckets,
    319  * and each slab of sfmmu_t's can use about a quarter of them, giving us
    320  * a fairly good distribution without wasting too much space and overhead
    321  * when we have to grab them all.
    322  */
    323 #define	SFMMU_NUM_LOCK	128		/* must be power of two */
    324 hatlock_t	hat_lock[SFMMU_NUM_LOCK];
    325 
    326 /*
    327  * Hash algorithm optimized for a small number of slabs.
    328  *  7 is (highbit((sizeof sfmmu_t)) - 1)
    329  * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
    330  * kmem_cache, and thus they will be sequential within that cache.  In
    331  * addition, each new slab will have a different "color" up to cache_maxcolor
    332  * which will skew the hashing for each successive slab which is allocated.
    333  * If the size of sfmmu_t changed to a larger size, this algorithm may need
    334  * to be revisited.
    335  */
    336 #define	TSB_HASH_SHIFT_BITS (7)
    337 #define	PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS)
    338 
    339 #ifdef DEBUG
    340 int tsb_hash_debug = 0;
    341 #define	TSB_HASH(sfmmup)	\
    342 	(tsb_hash_debug ? &hat_lock[0] : \
    343 	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
    344 #else	/* DEBUG */
    345 #define	TSB_HASH(sfmmup)	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]
    346 #endif	/* DEBUG */
    347 
    348 
    349 /* sfmmu_replace_tsb() return codes. */
    350 typedef enum tsb_replace_rc {
    351 	TSB_SUCCESS,
    352 	TSB_ALLOCFAIL,
    353 	TSB_LOSTRACE,
    354 	TSB_ALREADY_SWAPPED,
    355 	TSB_CANTGROW
    356 } tsb_replace_rc_t;
    357 
    358 /*
    359  * Flags for TSB allocation routines.
    360  */
    361 #define	TSB_ALLOC	0x01
    362 #define	TSB_FORCEALLOC	0x02
    363 #define	TSB_GROW	0x04
    364 #define	TSB_SHRINK	0x08
    365 #define	TSB_SWAPIN	0x10
    366 
    367 /*
    368  * Support for HAT callbacks.
    369  */
    370 #define	SFMMU_MAX_RELOC_CALLBACKS	10
    371 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
    372 static id_t sfmmu_cb_nextid = 0;
    373 static id_t sfmmu_tsb_cb_id;
    374 struct sfmmu_callback *sfmmu_cb_table;
    375 
    376 /*
    377  * Kernel page relocation is enabled by default for non-caged
    378  * kernel pages.  This has little effect unless segkmem_reloc is
    379  * set, since by default kernel memory comes from inside the
    380  * kernel cage.
    381  */
    382 int hat_kpr_enabled = 1;
    383 
    384 kmutex_t	kpr_mutex;
    385 kmutex_t	kpr_suspendlock;
    386 kthread_t	*kreloc_thread;
    387 
    388 /*
    389  * Enable VA->PA translation sanity checking on DEBUG kernels.
    390  * Disabled by default.  This is incompatible with some
    391  * drivers (error injector, RSM) so if it breaks you get
    392  * to keep both pieces.
    393  */
    394 int hat_check_vtop = 0;
    395 
    396 /*
    397  * Private sfmmu routines (prototypes)
    398  */
    399 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t);
    400 static struct 	hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t,
    401 			struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t,
    402 			uint_t);
    403 static caddr_t	sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t,
    404 			caddr_t, demap_range_t *, uint_t);
    405 static caddr_t	sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t,
    406 			caddr_t, int);
    407 static void	sfmmu_hblk_free(struct hme_blk **);
    408 static void	sfmmu_hblks_list_purge(struct hme_blk **, int);
    409 static uint_t	sfmmu_get_free_hblk(struct hme_blk **, uint_t);
    410 static uint_t	sfmmu_put_free_hblk(struct hme_blk *, uint_t);
    411 static struct hme_blk *sfmmu_hblk_steal(int);
    412 static int	sfmmu_steal_this_hblk(struct hmehash_bucket *,
    413 			struct hme_blk *, uint64_t, struct hme_blk *);
    414 static caddr_t	sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t);
    415 
    416 static void	hat_do_memload_array(struct hat *, caddr_t, size_t,
    417 		    struct page **, uint_t, uint_t, uint_t);
    418 static void	hat_do_memload(struct hat *, caddr_t, struct page *,
    419 		    uint_t, uint_t, uint_t);
    420 static void	sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **,
    421 		    uint_t, uint_t, pgcnt_t, uint_t);
    422 void		sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *,
    423 			uint_t);
    424 static int	sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **,
    425 			uint_t, uint_t);
    426 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *,
    427 					caddr_t, int, uint_t);
    428 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *,
    429 			struct hmehash_bucket *, caddr_t, uint_t, uint_t,
    430 			uint_t);
    431 static int	sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *,
    432 			caddr_t, page_t **, uint_t, uint_t);
    433 static void	sfmmu_tteload_release_hashbucket(struct hmehash_bucket *);
    434 
    435 static int	sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int);
    436 static pfn_t	sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *);
    437 void		sfmmu_memtte(tte_t *, pfn_t, uint_t, int);
    438 #ifdef VAC
    439 static void	sfmmu_vac_conflict(struct hat *, caddr_t, page_t *);
    440 static int	sfmmu_vacconflict_array(caddr_t, page_t *, int *);
    441 int	tst_tnc(page_t *pp, pgcnt_t);
    442 void	conv_tnc(page_t *pp, int);
    443 #endif
    444 
    445 static void	sfmmu_get_ctx(sfmmu_t *);
    446 static void	sfmmu_free_sfmmu(sfmmu_t *);
    447 
    448 static void	sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *);
    449 static void	sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int);
    450 
    451 cpuset_t	sfmmu_pageunload(page_t *, struct sf_hment *, int);
    452 static void	hat_pagereload(struct page *, struct page *);
    453 static cpuset_t	sfmmu_pagesync(page_t *, struct sf_hment *, uint_t);
    454 #ifdef VAC
    455 void	sfmmu_page_cache_array(page_t *, int, int, pgcnt_t);
    456 static void	sfmmu_page_cache(page_t *, int, int, int);
    457 #endif
    458 
    459 cpuset_t	sfmmu_rgntlb_demap(caddr_t, sf_region_t *,
    460     struct hme_blk *, int);
    461 static void	sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
    462 			pfn_t, int, int, int, int);
    463 static void	sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
    464 			pfn_t, int);
    465 static void	sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
    466 static void	sfmmu_tlb_range_demap(demap_range_t *);
    467 static void	sfmmu_invalidate_ctx(sfmmu_t *);
    468 static void	sfmmu_sync_mmustate(sfmmu_t *);
    469 
    470 static void 	sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
    471 static int	sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t,
    472 			sfmmu_t *);
    473 static void	sfmmu_tsb_free(struct tsb_info *);
    474 static void	sfmmu_tsbinfo_free(struct tsb_info *);
    475 static int	sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t,
    476 			sfmmu_t *);
    477 static void	sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *);
    478 static void	sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *);
    479 static int	sfmmu_select_tsb_szc(pgcnt_t);
    480 static void	sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int);
    481 #define		sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \
    482 	sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
    483 #define		sfmmu_unload_tsb(sfmmup, vaddr, szc)    \
    484 	sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
    485 static void	sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *);
    486 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t,
    487     hatlock_t *, uint_t);
    488 static void	sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int);
    489 
    490 #ifdef VAC
    491 void	sfmmu_cache_flush(pfn_t, int);
    492 void	sfmmu_cache_flushcolor(int, pfn_t);
    493 #endif
    494 static caddr_t	sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t,
    495 			caddr_t, demap_range_t *, uint_t, int);
    496 
    497 static uint64_t	sfmmu_vtop_attr(uint_t, int mode, tte_t *);
    498 static uint_t	sfmmu_ptov_attr(tte_t *);
    499 static caddr_t	sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t,
    500 			caddr_t, demap_range_t *, uint_t);
    501 static uint_t	sfmmu_vtop_prot(uint_t, uint_t *);
    502 static int	sfmmu_idcache_constructor(void *, void *, int);
    503 static void	sfmmu_idcache_destructor(void *, void *);
    504 static int	sfmmu_hblkcache_constructor(void *, void *, int);
    505 static void	sfmmu_hblkcache_destructor(void *, void *);
    506 static void	sfmmu_hblkcache_reclaim(void *);
    507 static void	sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *,
    508 			struct hmehash_bucket *);
    509 static void	sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *,
    510 			struct hme_blk *, struct hme_blk **, int);
    511 static void	sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *,
    512 			uint64_t);
    513 static struct hme_blk *sfmmu_check_pending_hblks(int);
    514 static void	sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int);
    515 static void	sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int);
    516 static void	sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t,
    517 			int, caddr_t *);
    518 static void	sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *);
    519 
    520 static void	sfmmu_rm_large_mappings(page_t *, int);
    521 
    522 static void	hat_lock_init(void);
    523 static void	hat_kstat_init(void);
    524 static int	sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
    525 static void	sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
    526 static	int	sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
    527 static void	sfmmu_check_page_sizes(sfmmu_t *, int);
    528 int	fnd_mapping_sz(page_t *);
    529 static void	iment_add(struct ism_ment *,  struct hat *);
    530 static void	iment_sub(struct ism_ment *, struct hat *);
    531 static pgcnt_t	ism_tsb_entries(sfmmu_t *, int szc);
    532 extern void	sfmmu_setup_tsbinfo(sfmmu_t *);
    533 extern void	sfmmu_clear_utsbinfo(void);
    534 
    535 static void	sfmmu_ctx_wrap_around(mmu_ctx_t *);
    536 
    537 extern int vpm_enable;
    538 
    539 /* kpm globals */
    540 #ifdef	DEBUG
    541 /*
    542  * Enable trap level tsbmiss handling
    543  */
    544 int	kpm_tsbmtl = 1;
    545 
    546 /*
    547  * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
    548  * required TLB shootdowns in this case, so handle w/ care. Off by default.
    549  */
    550 int	kpm_tlb_flush;
    551 #endif	/* DEBUG */
    552 
    553 static void	*sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
    554 
    555 #ifdef DEBUG
    556 static void	sfmmu_check_hblk_flist();
    557 #endif
    558 
    559 /*
    560  * Semi-private sfmmu data structures.  Some of them are initialized in
    561  * startup or in hat_init. Some of them are private but accessed by
    562  * assembly code or mach_sfmmu.c
    563  */
    564 struct hmehash_bucket *uhme_hash;	/* user hmeblk hash table */
    565 struct hmehash_bucket *khme_hash;	/* kernel hmeblk hash table */
    566 uint64_t	uhme_hash_pa;		/* PA of uhme_hash */
    567 uint64_t	khme_hash_pa;		/* PA of khme_hash */
    568 int 		uhmehash_num;		/* # of buckets in user hash table */
    569 int 		khmehash_num;		/* # of buckets in kernel hash table */
    570 
    571 uint_t		max_mmu_ctxdoms = 0;	/* max context domains in the system */
    572 mmu_ctx_t	**mmu_ctxs_tbl;		/* global array of context domains */
    573 uint64_t	mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */
    574 
    575 #define	DEFAULT_NUM_CTXS_PER_MMU 8192
    576 static uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;
    577 
    578 int		cache;			/* describes system cache */
    579 
    580 caddr_t		ktsb_base;		/* kernel 8k-indexed tsb base address */
    581 uint64_t	ktsb_pbase;		/* kernel 8k-indexed tsb phys address */
    582 int		ktsb_szcode;		/* kernel 8k-indexed tsb size code */
    583 int		ktsb_sz;		/* kernel 8k-indexed tsb size */
    584 
    585 caddr_t		ktsb4m_base;		/* kernel 4m-indexed tsb base address */
    586 uint64_t	ktsb4m_pbase;		/* kernel 4m-indexed tsb phys address */
    587 int		ktsb4m_szcode;		/* kernel 4m-indexed tsb size code */
    588 int		ktsb4m_sz;		/* kernel 4m-indexed tsb size */
    589 
    590 uint64_t	kpm_tsbbase;		/* kernel seg_kpm 4M TSB base address */
    591 int		kpm_tsbsz;		/* kernel seg_kpm 4M TSB size code */
    592 uint64_t	kpmsm_tsbbase;		/* kernel seg_kpm 8K TSB base address */
    593 int		kpmsm_tsbsz;		/* kernel seg_kpm 8K TSB size code */
    594 
    595 #ifndef sun4v
    596 int		utsb_dtlb_ttenum = -1;	/* index in TLB for utsb locked TTE */
    597 int		utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
    598 int		dtlb_resv_ttenum;	/* index in TLB of first reserved TTE */
    599 caddr_t		utsb_vabase;		/* reserved kernel virtual memory */
    600 caddr_t		utsb4m_vabase;		/* for trap handler TSB accesses */
    601 #endif /* sun4v */
    602 uint64_t	tsb_alloc_bytes = 0;	/* bytes allocated to TSBs */
    603 vmem_t		*kmem_tsb_default_arena[NLGRPS_MAX];	/* For dynamic TSBs */
    604 vmem_t		*kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */
    605 
    606 /*
    607  * Size to use for TSB slabs.  Future platforms that support page sizes
    608  * larger than 4M may wish to change these values, and provide their own
    609  * assembly macros for building and decoding the TSB base register contents.
    610  * Note disable_large_pages will override the value set here.
    611  */
    612 static	uint_t tsb_slab_ttesz = TTE4M;
    613 size_t	tsb_slab_size = MMU_PAGESIZE4M;
    614 uint_t	tsb_slab_shift = MMU_PAGESHIFT4M;
    615 /* PFN mask for TTE */
    616 size_t	tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;
    617 
    618 /*
    619  * Size to use for TSB slabs.  These are used only when 256M tsb arenas
    620  * exist.
    621  */
    622 static uint_t	bigtsb_slab_ttesz = TTE256M;
    623 static size_t	bigtsb_slab_size = MMU_PAGESIZE256M;
    624 static uint_t	bigtsb_slab_shift = MMU_PAGESHIFT256M;
    625 /* 256M page alignment for 8K pfn */
    626 static size_t	bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;
    627 
    628 /* largest TSB size to grow to, will be smaller on smaller memory systems */
    629 static int	tsb_max_growsize = 0;
    630 
    631 /*
    632  * Tunable parameters dealing with TSB policies.
    633  */
    634 
    635 /*
    636  * This undocumented tunable forces all 8K TSBs to be allocated from
    637  * the kernel heap rather than from the kmem_tsb_default_arena arenas.
    638  */
    639 #ifdef	DEBUG
    640 int	tsb_forceheap = 0;
    641 #endif	/* DEBUG */
    642 
    643 /*
    644  * Decide whether to use per-lgroup arenas, or one global set of
    645  * TSB arenas.  The default is not to break up per-lgroup, since
    646  * most platforms don't recognize any tangible benefit from it.
    647  */
    648 int	tsb_lgrp_affinity = 0;
    649 
    650 /*
    651  * Used for growing the TSB based on the process RSS.
    652  * tsb_rss_factor is based on the smallest TSB, and is
    653  * shifted by the TSB size to determine if we need to grow.
    654  * The default will grow the TSB if the number of TTEs for
    655  * this page size exceeds 75% of the number of TSB entries,
    656  * which should _almost_ eliminate all conflict misses
    657  * (at the expense of using up lots and lots of memory).
    658  */
    659 #define	TSB_RSS_FACTOR		(TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
    660 #define	SFMMU_RSS_TSBSIZE(tsbszc)	(tsb_rss_factor << tsbszc)
    661 #define	SELECT_TSB_SIZECODE(pgcnt) ( \
    662 	(enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \
    663 	default_tsb_size)
    664 #define	TSB_OK_SHRINK()	\
    665 	(tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
    666 #define	TSB_OK_GROW()	\
    667 	(tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)
    668 
    669 int	enable_tsb_rss_sizing = 1;
    670 int	tsb_rss_factor	= (int)TSB_RSS_FACTOR;
    671 
    672 /* which TSB size code to use for new address spaces or if rss sizing off */
    673 int default_tsb_size = TSB_8K_SZCODE;
    674 
    675 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
    676 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
    677 #define	TSB_ALLOC_HIWATER_FACTOR_DEFAULT	32
    678 
    679 #ifdef DEBUG
    680 static int tsb_random_size = 0;	/* set to 1 to test random tsb sizes on alloc */
    681 static int tsb_grow_stress = 0;	/* if set to 1, keep replacing TSB w/ random */
    682 static int tsb_alloc_mtbf = 0;	/* fail allocation every n attempts */
    683 static int tsb_alloc_fail_mtbf = 0;
    684 static int tsb_alloc_count = 0;
    685 #endif /* DEBUG */
    686 
    687 /* if set to 1, will remap valid TTEs when growing TSB. */
    688 int tsb_remap_ttes = 1;
    689 
    690 /*
    691  * If we have more than this many mappings, allocate a second TSB.
    692  * This default is chosen because the I/D fully associative TLBs are
    693  * assumed to have at least 8 available entries. Platforms with a
    694  * larger fully-associative TLB could probably override the default.
    695  */
    696 
    697 #ifdef sun4v
    698 int tsb_sectsb_threshold = 0;
    699 #else
    700 int tsb_sectsb_threshold = 8;
    701 #endif
    702 
    703 /*
    704  * kstat data
    705  */
    706 struct sfmmu_global_stat sfmmu_global_stat;
    707 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;
    708 
    709 /*
    710  * Global data
    711  */
    712 sfmmu_t 	*ksfmmup;		/* kernel's hat id */
    713 
/* Forward declaration; chk_tte() is only declared in DEBUG builds. */
#ifdef DEBUG
static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
#endif

/* sfmmu locking operations */
static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
static int	sfmmu_mlspl_held(struct page *, int);

/* The following three are not static, i.e. they have external linkage. */
kmutex_t *sfmmu_page_enter(page_t *);
void	sfmmu_page_exit(kmutex_t *);
int	sfmmu_page_spl_held(struct page *);

/* sfmmu internal locking operations - accessed directly */
static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
				kmutex_t **, kmutex_t **);
static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
static hatlock_t *
		sfmmu_hat_enter(sfmmu_t *);
static hatlock_t *
		sfmmu_hat_tryenter(sfmmu_t *);
static void	sfmmu_hat_exit(hatlock_t *);
static void	sfmmu_hat_lock_all(void);
static void	sfmmu_hat_unlock_all(void);
static void	sfmmu_ismhat_enter(sfmmu_t *, int);
static void	sfmmu_ismhat_exit(sfmmu_t *, int);
    739 
    740 /*
    741  * Array of mutexes protecting a page's mapping list and p_nrm field.
    742  *
    743  * The hash function looks complicated, but is made up so that:
    744  *
    745  * "pp" not shifted, so adjacent pp values will hash to different cache lines
    746  *  (8 byte alignment * 8 bytes/mutes == 64 byte coherency subblock)
    747  *
    748  * "pp" >> mml_shift, incorporates more source bits into the hash result
    749  *
    750  *  "& (mml_table_size - 1), should be faster than using remainder "%"
    751  *
    752  * Hopefully, mml_table, mml_table_size and mml_shift are all in the same
    753  * cacheline, since they get declared next to each other below. We'll trust
    754  * ld not to do something random.
    755  */
    756 #ifdef	DEBUG
    757 int mlist_hash_debug = 0;
    758 #define	MLIST_HASH(pp)	(mlist_hash_debug ? &mml_table[0] : \
    759 	&mml_table[((uintptr_t)(pp) + \
    760 	((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)])
    761 #else	/* !DEBUG */
    762 #define	MLIST_HASH(pp)   &mml_table[ \
    763 	((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]
    764 #endif	/* !DEBUG */
    765 
/*
 * Page mapping-list mutex table.  Entries are selected with MLIST_HASH()
 * and initialized by hat_lock_init().
 */
kmutex_t		*mml_table;
uint_t			mml_table_sz;	/* must be a power of 2 */
uint_t			mml_shift;	/* log2(mml_table_sz) + 3 for align */

/*
 * kpm lock table.  The khl_mutex of each entry is initialized by
 * hat_lock_init() when kpm_enable is set.
 */
kpm_hlk_t	*kpmp_table;
uint_t		kpmp_table_sz;	/* must be a power of 2 */
uchar_t		kpmp_shift;

kpm_shlk_t	*kpmp_stable;
uint_t		kpmp_stable_sz;	/* must be a power of 2 */
    776 
    777 /*
    778  * SPL_HASH was improved to avoid false cache line sharing
    779  */
    780 #define	SPL_TABLE_SIZE	128
    781 #define	SPL_MASK	(SPL_TABLE_SIZE - 1)
    782 #define	SPL_SHIFT	7		/* log2(SPL_TABLE_SIZE) */
    783 
    784 #define	SPL_INDEX(pp) \
    785 	((((uintptr_t)(pp) >> SPL_SHIFT) ^ \
    786 	((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \
    787 	(SPL_TABLE_SIZE - 1))
    788 
    789 #define	SPL_HASH(pp)    \
    790 	(&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex)
    791 
/* Table of padded mutexes; SPL_HASH() selects the slot for a given page. */
static	pad_mutex_t	sfmmu_page_lock[SPL_TABLE_SIZE];


/*
 * hat_unload_callback() will group together callbacks in order
 * to avoid xt_sync() calls.  This is the maximum size of the group.
 */
#define	MAX_CB_ADDR	32

/* Filled in by MAKE_TTE_MASK() in hat_init(): hardware-only bits in a TTE. */
tte_t	hw_tte;
static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;

/*
 * Names given to the per-MMU context domain named kstats.
 * sfmmu_mmu_kstat_create() indexes this array by statistic number.
 */
static char	*mmu_ctx_kstat_names[] = {
	"mmu_ctx_tsb_exceptions",
	"mmu_ctx_tsb_raise_exception",
	"mmu_ctx_wrap_around",
};
    809 
    810 /*
    811  * Wrapper for vmem_xalloc since vmem_create only allows limited
    812  * parameters for vm_source_alloc functions.  This function allows us
    813  * to specify alignment consistent with the size of the object being
    814  * allocated.
    815  */
    816 static void *
    817 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
    818 {
    819 	return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
    820 }
    821 
    822 /* Common code for setting tsb_alloc_hiwater. */
    823 #define	SFMMU_SET_TSB_ALLOC_HIWATER(pages)	tsb_alloc_hiwater = \
    824 		ptob(pages) / tsb_alloc_hiwater_factor
    825 
    826 /*
    827  * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
    828  * a single TSB.  physmem is the number of physical pages so we need physmem 8K
    829  * TTEs to represent all those physical pages.  We round this up by using
    830  * 1<<highbit().  To figure out which size code to use, remember that the size
    831  * code is just an amount to shift the smallest TSB size to get the size of
    832  * this TSB.  So we subtract that size, TSB_START_SIZE, from highbit() (or
    833  * highbit() - 1) to get the size code for the smallest TSB that can represent
    834  * all of physical memory, while erring on the side of too much.
    835  *
    836  * Restrict tsb_max_growsize to make sure that:
    837  *	1) TSBs can't grow larger than the TSB slab size
    838  *	2) TSBs can't grow larger than UTSB_MAX_SZCODE.
    839  */
    840 #define	SFMMU_SET_TSB_MAX_GROWSIZE(pages) {				\
    841 	int	_i, _szc, _slabszc, _tsbszc;				\
    842 									\
    843 	_i = highbit(pages);						\
    844 	if ((1 << (_i - 1)) == (pages))					\
    845 		_i--;		/* 2^n case, round down */              \
    846 	_szc = _i - TSB_START_SIZE;					\
    847 	_slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
    848 	_tsbszc = MIN(_szc, _slabszc);                                  \
    849 	tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE);               \
    850 }
    851 
    852 /*
    853  * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
    854  * tsb_info which handles that TTE size.
    855  */
    856 #define	SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) {			\
    857 	(tsbinfop) = (sfmmup)->sfmmu_tsb;				\
    858 	ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) ||		\
    859 	    sfmmu_hat_lock_held(sfmmup));				\
    860 	if ((tte_szc) >= TTE4M)	{					\
    861 		ASSERT((tsbinfop) != NULL);				\
    862 		(tsbinfop) = (tsbinfop)->tsb_next;			\
    863 	}								\
    864 }
    865 
    866 /*
    867  * Macro to use to unload entries from the TSB.
    868  * It has knowledge of which page sizes get replicated in the TSB
    869  * and will call the appropriate unload routine for the appropriate size.
    870  */
    871 #define	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat)		\
    872 {									\
    873 	int ttesz = get_hblk_ttesz(hmeblkp);				\
    874 	if (ttesz == TTE8K || ttesz == TTE4M) {				\
    875 		sfmmu_unload_tsb(sfmmup, addr, ttesz);			\
    876 	} else {							\
    877 		caddr_t sva = ismhat ? addr : 				\
    878 		    (caddr_t)get_hblk_base(hmeblkp);			\
    879 		caddr_t eva = sva + get_hblk_span(hmeblkp);		\
    880 		ASSERT(addr >= sva && addr < eva);			\
    881 		sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz);	\
    882 	}								\
    883 }
    884 
    885 
    886 /* Update tsb_alloc_hiwater after memory is configured. */
    887 /*ARGSUSED*/
    888 static void
    889 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
    890 {
    891 	/* Assumes physmem has already been updated. */
    892 	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
    893 	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
    894 }
    895 
    896 /*
    897  * Update tsb_alloc_hiwater before memory is deleted.  We'll do nothing here
    898  * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
    899  * deleted.
    900  */
    901 /*ARGSUSED*/
    902 static int
    903 sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
    904 {
    905 	return (0);
    906 }
    907 
    908 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
    909 /*ARGSUSED*/
    910 static void
    911 sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
    912 {
    913 	/*
    914 	 * Whether the delete was cancelled or not, just go ahead and update
    915 	 * tsb_alloc_hiwater and tsb_max_growsize.
    916 	 */
    917 	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
    918 	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
    919 }
    920 
    921 static kphysm_setup_vector_t sfmmu_update_vec = {
    922 	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
    923 	sfmmu_update_post_add,		/* post_add */
    924 	sfmmu_update_pre_del,		/* pre_del */
    925 	sfmmu_update_post_del		/* post_del */
    926 };
    927 
    928 
    929 /*
    930  * HME_BLK HASH PRIMITIVES
    931  */
    932 
    933 /*
    934  * Enter a hme on the mapping list for page pp.
    935  * When large pages are more prevalent in the system we might want to
    936  * keep the mapping list in ascending order by the hment size. For now,
    937  * small pages are more frequent, so don't slow it down.
    938  */
    939 #define	HME_ADD(hme, pp)					\
    940 {								\
    941 	ASSERT(sfmmu_mlist_held(pp));				\
    942 								\
    943 	hme->hme_prev = NULL;					\
    944 	hme->hme_next = pp->p_mapping;				\
    945 	hme->hme_page = pp;					\
    946 	if (pp->p_mapping) {					\
    947 		((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
    948 		ASSERT(pp->p_share > 0);			\
    949 	} else  {						\
    950 		/* EMPTY */					\
    951 		ASSERT(pp->p_share == 0);			\
    952 	}							\
    953 	pp->p_mapping = hme;					\
    954 	pp->p_share++;						\
    955 }
    956 
/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
 *
 * The caller must hold the page's mapping list lock.  The macro panics if
 * the page has no mappings, decrements p_share, unlinks hme from the doubly
 * linked list, clears hme's link and page fields and, for hments larger
 * than TTE8K, calls sfmmu_rm_large_mappings() for the rest of the large page.
 */
#define	HME_SUB(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
	ASSERT(hme->hme_page == pp || IS_PAHME(hme));		\
								\
	if (pp->p_mapping == NULL) {				\
		panic("hme_remove - no mappings");		\
	}							\
								\
	membar_stst();	/* ensure previous stores finish */	\
								\
	ASSERT(pp->p_share > 0);				\
	pp->p_share--;						\
								\
	if (hme->hme_prev) {					\
		ASSERT(pp->p_mapping != hme);			\
		ASSERT(hme->hme_prev->hme_page == pp ||		\
			IS_PAHME(hme->hme_prev));		\
		hme->hme_prev->hme_next = hme->hme_next;	\
	} else {						\
		ASSERT(pp->p_mapping == hme);			\
		pp->p_mapping = hme->hme_next;			\
		ASSERT((pp->p_mapping == NULL) ?		\
			(pp->p_share == 0) : 1);		\
	}							\
								\
	if (hme->hme_next) {					\
		ASSERT(hme->hme_next->hme_page == pp ||		\
			IS_PAHME(hme->hme_next));		\
		hme->hme_next->hme_prev = hme->hme_prev;	\
	}							\
								\
	/* zero out the entry */				\
	hme->hme_next = NULL;					\
	hme->hme_prev = NULL;					\
	hme->hme_page = NULL;					\
								\
	if (hme_size(hme) > TTE8K) {				\
		/* remove mappings for remainder of large pg */	\
		sfmmu_rm_large_mappings(pp, hme_size(hme));	\
	}							\
}
   1004 
   1005 /*
   1006  * This function returns the hment given the hme_blk and a vaddr.
   1007  * It assumes addr has already been checked to belong to hme_blk's
   1008  * range.
   1009  */
   1010 #define	HBLKTOHME(hment, hmeblkp, addr)					\
   1011 {									\
   1012 	int index;							\
   1013 	HBLKTOHME_IDX(hment, hmeblkp, addr, index)			\
   1014 }
   1015 
   1016 /*
   1017  * Version of HBLKTOHME that also returns the index in hmeblkp
   1018  * of the hment.
   1019  */
   1020 #define	HBLKTOHME_IDX(hment, hmeblkp, addr, idx)			\
   1021 {									\
   1022 	ASSERT(in_hblk_range((hmeblkp), (addr)));			\
   1023 									\
   1024 	if (get_hblk_ttesz(hmeblkp) == TTE8K) {				\
   1025 		idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
   1026 	} else								\
   1027 		idx = 0;						\
   1028 									\
   1029 	(hment) = &(hmeblkp)->hblk_hme[idx];				\
   1030 }
   1031 
   1032 /*
   1033  * Disable any page sizes not supported by the CPU
   1034  */
   1035 void
   1036 hat_init_pagesizes()
   1037 {
   1038 	int 		i;
   1039 
   1040 	mmu_exported_page_sizes = 0;
   1041 	for (i = TTE8K; i < max_mmu_page_sizes; i++) {
   1042 
   1043 		szc_2_userszc[i] = (uint_t)-1;
   1044 		userszc_2_szc[i] = (uint_t)-1;
   1045 
   1046 		if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
   1047 			disable_large_pages |= (1 << i);
   1048 		} else {
   1049 			szc_2_userszc[i] = mmu_exported_page_sizes;
   1050 			userszc_2_szc[mmu_exported_page_sizes] = i;
   1051 			mmu_exported_page_sizes++;
   1052 		}
   1053 	}
   1054 
   1055 	disable_ism_large_pages |= disable_large_pages;
   1056 	disable_auto_data_large_pages = disable_large_pages;
   1057 	disable_auto_text_large_pages = disable_large_pages;
   1058 
   1059 	/*
   1060 	 * Initialize mmu-specific large page sizes.
   1061 	 */
   1062 	if (&mmu_large_pages_disabled) {
   1063 		disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
   1064 		disable_ism_large_pages |=
   1065 		    mmu_large_pages_disabled(HAT_LOAD_SHARE);
   1066 		disable_auto_data_large_pages |=
   1067 		    mmu_large_pages_disabled(HAT_AUTO_DATA);
   1068 		disable_auto_text_large_pages |=
   1069 		    mmu_large_pages_disabled(HAT_AUTO_TEXT);
   1070 	}
   1071 }
   1072 
/*
 * Initialize the hardware address translation structures.
 *
 * In order, this routine: initializes the hat locks and kstats, builds the
 * hardware-only TTE mask, determines the usable page sizes, initializes the
 * kernel and user hmeblk hash buckets, allocates the MMU context domain
 * table and sets up the boot CPU, creates the kmem caches and vmem arenas
 * used for sfmmu, TSB, hmeblk and ISM structures, allocates the kernel's
 * hat, and finally sets up the relocation locks, the shared-context caches
 * (only when shctx_on is set), the hrm hash table and the per-CPU pending
 * hmeblk lists.
 */
void
hat_init(void)
{
	int 		i;
	uint_t		sz;
	size_t		size;

	hat_lock_init();
	hat_kstat_init();

	/*
	 * Hardware-only bits in a TTE
	 */
	MAKE_TTE_MASK(&hw_tte);

	hat_init_pagesizes();

	/* Initialize the hash locks */
	for (i = 0; i < khmehash_num; i++) {
		mutex_init(&khme_hash[i].hmehash_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
	}
	for (i = 0; i < uhmehash_num; i++) {
		mutex_init(&uhme_hash[i].hmehash_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
	}
	khmehash_num--;		/* make sure counter starts from 0 */
	uhmehash_num--;		/* make sure counter starts from 0 */

	/*
	 * Allocate context domain structures.
	 *
	 * A platform may choose to modify max_mmu_ctxdoms in
	 * set_platform_defaults(). If a platform does not define
	 * a set_platform_defaults() or does not choose to modify
	 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
	 *
	 * For sun4v, there will be one global context domain, this is to
	 * avoid the ldom cpu substitution problem.
	 *
	 * For all platforms that have CPUs sharing MMUs, this
	 * value must be defined.
	 */
	if (max_mmu_ctxdoms == 0) {
#ifndef sun4v
		max_mmu_ctxdoms = max_ncpus;
#else /* sun4v */
		max_mmu_ctxdoms = 1;
#endif /* sun4v */
	}

	size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
	mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);

	/* mmu_ctx_t is 64 bytes aligned */
	mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
	    sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * MMU context domain initialization for the Boot CPU.
	 * This needs the context domains array allocated above.
	 */
	mutex_enter(&cpu_lock);
	sfmmu_cpu_init(CPU);
	mutex_exit(&cpu_lock);

	/*
	 * Initialize ism mapping list lock.
	 */

	mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Each sfmmu structure carries an array of MMU context info
	 * structures, one per context domain. The size of this array depends
	 * on the maximum number of context domains. So, the size of the
	 * sfmmu structure varies per platform.
	 *
	 * sfmmu is allocated from static arena, because trap
	 * handler at TL > 0 is not allowed to touch kernel relocatable
	 * memory. sfmmu's alignment is changed to 64 bytes from
	 * default 8 bytes, as the lower 6 bits will be used to pass
	 * pgcnt to vtag_flush_pgcnt_tl1.
	 */
	size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1);

	sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size,
	    64, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
	    NULL, NULL, static_arena, 0);

	sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache",
	    sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Since we only use the tsb8k cache to "borrow" pages for TSBs
	 * from the heap when low on memory or when TSB_FORCEALLOC is
	 * specified, don't use magazines to cache them--we want to return
	 * them to the system as quickly as possible.
	 */
	sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache",
	    MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL,
	    static_arena, KMC_NOMAGAZINE);

	/*
	 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
	 * memory, which corresponds to the old static reserve for TSBs.
	 * tsb_alloc_hiwater_factor defaults to 32.  This caps the amount of
	 * memory we'll allocate for TSB slabs; beyond this point TSB
	 * allocations will be taken from the kernel heap (via
	 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
	 * consumer.
	 */
	if (tsb_alloc_hiwater_factor == 0) {
		tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT;
	}
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);

	/*
	 * Starting at tsb_slab_ttesz, walk down to the first page size that
	 * is not in disable_large_pages.  The loop stops at size code 0 if
	 * every larger size is disabled.
	 */
	for (sz = tsb_slab_ttesz; sz > 0; sz--) {
		if (!(disable_large_pages & (1 << sz)))
			break;
	}

	if (sz < tsb_slab_ttesz) {
		/*
		 * The configured slab page size is disabled; fall back to
		 * the size found above.  The shift works out to
		 * MMU_PAGESHIFT + 3 * sz.
		 */
		tsb_slab_ttesz = sz;
		tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz;
		tsb_slab_size = 1 << tsb_slab_shift;
		tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1;
		use_bigtsb_arena = 0;
	} else if (use_bigtsb_arena &&
	    (disable_large_pages & (1 << bigtsb_slab_ttesz))) {
		use_bigtsb_arena = 0;
	}

	/*
	 * SFMMU_SET_TSB_MAX_GROWSIZE() reads bigtsb_slab_shift, so make it
	 * match the regular slab shift first when the big arena is unused.
	 */
	if (!use_bigtsb_arena) {
		bigtsb_slab_shift = tsb_slab_shift;
	}
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);

	/*
	 * On smaller memory systems, allocate TSB memory in smaller chunks
	 * than the default 4M slab size. We also honor disable_large_pages
	 * here.
	 *
	 * The trap handlers need to be patched with the final slab shift,
	 * since they need to be able to construct the TSB pointer at runtime.
	 */
	if ((tsb_max_growsize <= TSB_512K_SZCODE) &&
	    !(disable_large_pages & (1 << TTE512K))) {
		tsb_slab_ttesz = TTE512K;
		tsb_slab_shift = MMU_PAGESHIFT512K;
		tsb_slab_size = MMU_PAGESIZE512K;
		tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT;
		use_bigtsb_arena = 0;
	}

	/* Without a big TSB arena, the big slab parameters mirror the rest. */
	if (!use_bigtsb_arena) {
		bigtsb_slab_ttesz = tsb_slab_ttesz;
		bigtsb_slab_shift = tsb_slab_shift;
		bigtsb_slab_size = tsb_slab_size;
		bigtsb_slab_mask = tsb_slab_mask;
	}


	/*
	 * Set up memory callback to update tsb_alloc_hiwater and
	 * tsb_max_growsize.
	 */
	i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0);
	ASSERT(i == 0);

	/*
	 * kmem_tsb_arena is the source from which large TSB slabs are
	 * drawn.  The quantum of this arena corresponds to the largest
	 * TSB size we can dynamically allocate for user processes.
	 * Currently it must also be a supported page size since we
	 * use exactly one translation entry to map each slab page.
	 *
	 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
	 * which most TSBs are allocated.  Since most TSB allocations are
	 * typically 8K we have a kmem cache we stack on top of each
	 * kmem_tsb_default_arena to speed up those allocations.
	 *
	 * Note the two-level scheme of arenas is required only
	 * because vmem_create doesn't allow us to specify alignment
	 * requirements.  If this ever changes the code could be
	 * simplified to use only one level of arenas.
	 *
	 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
	 * will be provided in addition to the 4M kmem_tsb_arena.
	 */
	if (use_bigtsb_arena) {
		kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0,
		    bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper,
		    vmem_xfree, heap_arena, 0, VM_SLEEP);
	}

	kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size,
	    sfmmu_vmem_xalloc_aligned_wrapper,
	    vmem_xfree, heap_arena, 0, VM_SLEEP);

	if (tsb_lgrp_affinity) {
		/* One set of arenas and one 8K cache per possible lgroup. */
		char s[50];
		for (i = 0; i < NLGRPS_MAX; i++) {
			if (use_bigtsb_arena) {
				(void) sprintf(s, "kmem_bigtsb_lgrp%d", i);
				kmem_bigtsb_default_arena[i] = vmem_create(s,
				    NULL, 0, 2 * tsb_slab_size,
				    sfmmu_tsb_segkmem_alloc,
				    sfmmu_tsb_segkmem_free, kmem_bigtsb_arena,
				    0, VM_SLEEP | VM_BESTFIT);
			}

			(void) sprintf(s, "kmem_tsb_lgrp%d", i);
			kmem_tsb_default_arena[i] = vmem_create(s,
			    NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
			    sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
			    VM_SLEEP | VM_BESTFIT);

			(void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i);
			sfmmu_tsb_cache[i] = kmem_cache_create(s,
			    PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
			    kmem_tsb_default_arena[i], 0);
		}
	} else {
		/* No lgroup affinity: a single set of arenas in slot 0. */
		if (use_bigtsb_arena) {
			kmem_bigtsb_default_arena[0] =
			    vmem_create("kmem_bigtsb_default", NULL, 0,
			    2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc,
			    sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0,
			    VM_SLEEP | VM_BESTFIT);
		}

		kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default",
		    NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
		    sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
		    VM_SLEEP | VM_BESTFIT);
		sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache",
		    PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
		    kmem_tsb_default_arena[0], 0);
	}

	/* Caches for the two hme_blk sizes (HME8BLK_SZ and HME1BLK_SZ). */
	sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ,
	    HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
	    sfmmu_hblkcache_destructor,
	    sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ,
	    hat_memload_arena, KMC_NOHASH);

	hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE,
	    segkmem_alloc_permanent, segkmem_free, heap_arena, 0,
	    VMC_DUMPSAFE | VM_SLEEP);

	sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ,
	    HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
	    sfmmu_hblkcache_destructor,
	    NULL, (void *)HME1BLK_SZ,
	    hat_memload1_arena, KMC_NOHASH);

	pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ,
	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);

	ism_blk_cache = kmem_cache_create("ism_blk_cache",
	    sizeof (ism_blk_t), ecache_alignsize, NULL, NULL,
	    NULL, NULL, static_arena, KMC_NOHASH);

	ism_ment_cache = kmem_cache_create("ism_ment_cache",
	    sizeof (ism_ment_t), 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	/*
	 * We grab the first hat for the kernel,
	 */
	AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER);
	kas.a_hat = hat_alloc(&kas);
	AS_LOCK_EXIT(&kas, &kas.a_lock);

	/*
	 * Initialize hblk_reserve.
	 */
	((struct hme_blk *)hblk_reserve)->hblk_nextpa =
	    va_to_pa((caddr_t)hblk_reserve);

#ifndef UTSB_PHYS
	/*
	 * Reserve some kernel virtual address space for the locked TTEs
	 * that allow us to probe the TSB from TL>0.
	 */
	utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
	    0, 0, NULL, NULL, VM_SLEEP);
	utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
	    0, 0, NULL, NULL, VM_SLEEP);
#endif

#ifdef VAC
	/*
	 * The big page VAC handling code assumes VAC
	 * will not be bigger than the smallest big
	 * page- which is 64K.
	 */
	if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) {
		cmn_err(CE_PANIC, "VAC too big!");
	}
#endif

	(void) xhat_init();

	/* Record the physical addresses of the two hmeblk hash tables. */
	uhme_hash_pa = va_to_pa(uhme_hash);
	khme_hash_pa = va_to_pa(khme_hash);

	/*
	 * Initialize relocation locks. kpr_suspendlock is held
	 * at PIL_MAX to prevent interrupts from pinning the holder
	 * of a suspended TTE which may access it leading to a
	 * deadlock condition.
	 */
	mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX);

	/*
	 * If Shared context support is disabled via /etc/system
	 * set shctx_on to 0 here if it was set to 1 earlier in boot
	 * sequence by cpu module initialization code.
	 */
	if (shctx_on && disable_shctx) {
		shctx_on = 0;
	}

	if (shctx_on) {
		/* SRD hash buckets plus the SRD, region and SCD caches. */
		srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS *
		    sizeof (srd_buckets[0]), KM_SLEEP);
		for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) {
			mutex_init(&srd_buckets[i].srdb_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}

		srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t),
		    0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor,
		    NULL, NULL, NULL, 0);
		region_cache = kmem_cache_create("region_cache",
		    sizeof (sf_region_t), 0, sfmmu_rgncache_constructor,
		    sfmmu_rgncache_destructor, NULL, NULL, NULL, 0);
		scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t),
		    0, sfmmu_scdcache_constructor,  sfmmu_scdcache_destructor,
		    NULL, NULL, NULL, 0);
	}

	/*
	 * Pre-allocate hrm_hashtab before enabling the collection of
	 * refmod statistics.  Allocating on the fly would mean us
	 * running the risk of suffering recursive mutex enters or
	 * deadlocks.
	 */
	hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
	    KM_SLEEP);

	/*
	 * Allocate per-cpu pending freelist of hmeblks.  The allocation is
	 * 64 bytes larger than the array so that the start can be rounded
	 * up to a 64-byte boundary; the unrounded address is not kept.
	 */
	cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64,
	    KM_SLEEP);
	cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP(
	    (uintptr_t)cpu_hme_pend, 64);

	for (i = 0; i < NCPU; i++) {
		mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT,
		    NULL);
	}

	/* Use the built-in threshold unless one has already been set. */
	if (cpu_hme_pend_thresh == 0) {
		cpu_hme_pend_thresh = CPU_HME_PEND_THRESH;
	}
}
   1446 
   1447 /*
   1448  * Initialize locking for the hat layer, called early during boot.
   1449  */
   1450 static void
   1451 hat_lock_init()
   1452 {
   1453 	int i;
   1454 
   1455 	/*
   1456 	 * initialize the array of mutexes protecting a page's mapping
   1457 	 * list and p_nrm field.
   1458 	 */
   1459 	for (i = 0; i < mml_table_sz; i++)
   1460 		mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL);
   1461 
   1462 	if (kpm_enable) {
   1463 		for (i = 0; i < kpmp_table_sz; i++) {
   1464 			mutex_init(&kpmp_table[i].khl_mutex, NULL,
   1465 			    MUTEX_DEFAULT, NULL);
   1466 		}
   1467 	}
   1468 
   1469 	/*
   1470 	 * Initialize array of mutex locks that protects sfmmu fields and
   1471 	 * TSB lists.
   1472 	 */
   1473 	for (i = 0; i < SFMMU_NUM_LOCK; i++)
   1474 		mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT,
   1475 		    NULL);
   1476 }
   1477 
/*
 * Evaluates to kmem64_end (cast to uintptr_t) when kmem64_base is non-zero
 * and to SYSLIMIT otherwise.
 * NOTE(review): presumably the highest kernel virtual address the hat has
 * to deal with -- confirm against the users of this macro.
 */
#define	SFMMU_KERNEL_MAXVA \
	(kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))
   1480 
   1481 /*
   1482  * Allocate a hat structure.
   1483  * Called when an address space first uses a hat.
   1484  */
   1485 struct hat *
   1486 hat_alloc(struct as *as)
   1487 {
   1488 	sfmmu_t *sfmmup;
   1489 	int i;
   1490 	uint64_t cnum;
   1491 	extern uint_t get_color_start(struct as *);
   1492 
   1493 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
   1494 	sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
   1495 	sfmmup->sfmmu_as = as;
   1496 	sfmmup->sfmmu_flags = 0;
   1497 	sfmmup->sfmmu_tteflags = 0;
   1498 	sfmmup->sfmmu_rtteflags = 0;
   1499 	LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock);
   1500 
   1501 	if (as == &kas) {
   1502 		ksfmmup = sfmmup;
   1503 		sfmmup->sfmmu_cext = 0;
   1504 		cnum = KCONTEXT;
   1505 
   1506 		sfmmup->sfmmu_clrstart = 0;
   1507 		sfmmup->sfmmu_tsb = NULL;
   1508 		/*
   1509 		 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
   1510 		 * to setup tsb_info for ksfmmup.
   1511 		 */
   1512 	} else {
   1513 
   1514 		/*
   1515 		 * Just set to invalid ctx. When it faults, it will
   1516 		 * get a valid ctx. This would avoid the situation
   1517 		 * where we get a ctx, but it gets stolen and then
   1518 		 * we fault when we try to run and so have to get
   1519 		 * another ctx.
   1520 		 */
   1521 		sfmmup->sfmmu_cext = 0;
   1522 		cnum = INVALID_CONTEXT;
   1523 
   1524 		/* initialize original physical page coloring bin */
   1525 		sfmmup->sfmmu_clrstart = get_color_start(as);
   1526 #ifdef DEBUG
   1527 		if (tsb_random_size) {
   1528 			uint32_t randval = (uint32_t)gettick() >> 4;
   1529 			int size = randval % (tsb_max_growsize + 1);
   1530 
   1531 			/* chose a random tsb size for stress testing */
   1532 			(void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size,
   1533 			    TSB8K|TSB64K|TSB512K, 0, sfmmup);
   1534 		} else
   1535 #endif /* DEBUG */
   1536 			(void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb,
   1537 			    default_tsb_size,
   1538 			    TSB8K|TSB64K|TSB512K, 0, sfmmup);
   1539 		sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID;
   1540 		ASSERT(sfmmup->sfmmu_tsb != NULL);
   1541 	}
   1542 
   1543 	ASSERT(max_mmu_ctxdoms > 0);
   1544 	for (i = 0; i < max_mmu_ctxdoms; i++) {
   1545 		sfmmup->sfmmu_ctxs[i].cnum = cnum;
   1546 		sfmmup->sfmmu_ctxs[i].gnum = 0;
   1547 	}
   1548 
   1549 	for (i = 0; i < max_mmu_page_sizes; i++) {
   1550 		sfmmup->sfmmu_ttecnt[i] = 0;
   1551 		sfmmup->sfmmu_scdrttecnt[i] = 0;
   1552 		sfmmup->sfmmu_ismttecnt[i] = 0;
   1553 		sfmmup->sfmmu_scdismttecnt[i] = 0;
   1554 		sfmmup->sfmmu_pgsz[i] = TTE8K;
   1555 	}
   1556 	sfmmup->sfmmu_tsb0_4minflcnt = 0;
   1557 	sfmmup->sfmmu_iblk = NULL;
   1558 	sfmmup->sfmmu_ismhat = 0;
   1559 	sfmmup->sfmmu_scdhat = 0;
   1560 	sfmmup->sfmmu_ismblkpa = (uint64_t)-1;
   1561 	if (sfmmup == ksfmmup) {
   1562 		CPUSET_ALL(sfmmup->sfmmu_cpusran);
   1563 	} else {
   1564 		CPUSET_ZERO(sfmmup->sfmmu_cpusran);
   1565 	}
   1566 	sfmmup->sfmmu_free = 0;
   1567 	sfmmup->sfmmu_rmstat = 0;
   1568 	sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart;
   1569 	sfmmup->sfmmu_xhat_provider = NULL;
   1570 	cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL);
   1571 	sfmmup->sfmmu_srdp = NULL;
   1572 	SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map);
   1573 	bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
   1574 	sfmmup->sfmmu_scdp = NULL;
   1575 	sfmmup->sfmmu_scd_link.next = NULL;
   1576 	sfmmup->sfmmu_scd_link.prev = NULL;
   1577 	return (sfmmup);
   1578 }
   1579 
   1580 /*
   1581  * Create per-MMU context domain kstats for a given MMU ctx.
   1582  */
   1583 static void
   1584 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp)
   1585 {
   1586 	mmu_ctx_stat_t	stat;
   1587 	kstat_t		*mmu_kstat;
   1588 
   1589 	ASSERT(MUTEX_HELD(&cpu_lock));
   1590 	ASSERT(mmu_ctxp->mmu_kstat == NULL);
   1591 
   1592 	mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx",
   1593 	    "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL);
   1594 
   1595 	if (mmu_kstat == NULL) {
   1596 		cmn_err(CE_WARN, "kstat_create for MMU %d failed",
   1597 		    mmu_ctxp->mmu_idx);
   1598 	} else {
   1599 		mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data;
   1600 		for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++)
   1601 			kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat],
   1602 			    mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64);
   1603 		mmu_ctxp->mmu_kstat = mmu_kstat;
   1604 		kstat_install(mmu_kstat);
   1605 	}
   1606 }
   1607 
   1608 /*
   1609  * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
   1610  * context domain information for a given CPU. If a platform does not
   1611  * specify that interface, then the function below is used instead to return
   1612  * default information. The defaults are as follows:
   1613  *
   1614  *	- For sun4u systems there's one MMU context domain per CPU.
   1615  *	  This default is used by all sun4u systems except OPL. OPL systems
   1616  *	  provide platform specific interface to map CPU ids to MMU ids
   1617  *	  because on OPL more than 1 CPU shares a single MMU.
   1618  *        Note that on sun4v, there is one global context domain for
   1619  *	  the entire system. This is to avoid running into potential problem
   1620  *	  with ldom physical cpu substitution feature.
   1621  *	- The number of MMU context IDs supported on any CPU in the
   1622  *	  system is 8K.
   1623  */
   1624 /*ARGSUSED*/
   1625 static void
   1626 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
   1627 {
   1628 	infop->mmu_nctxs = nctxs;
   1629 #ifndef sun4v
   1630 	infop->mmu_idx = cpu[cpuid]->cpu_seqid;
   1631 #else /* sun4v */
   1632 	infop->mmu_idx = 0;
   1633 #endif /* sun4v */
   1634 }
   1635 
   1636 /*
   1637  * Called during CPU initialization to set the MMU context-related information
   1638  * for a CPU.
   1639  *
   1640  * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
   1641  */
   1642 void
   1643 sfmmu_cpu_init(cpu_t *cp)
   1644 {
   1645 	mmu_ctx_info_t	info;
   1646 	mmu_ctx_t	*mmu_ctxp;
   1647 
   1648 	ASSERT(MUTEX_HELD(&cpu_lock));
   1649 
   1650 	if (&plat_cpuid_to_mmu_ctx_info == NULL)
   1651 		sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
   1652 	else
   1653 		plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
   1654 
   1655 	ASSERT(info.mmu_idx < max_mmu_ctxdoms);
   1656 
   1657 	if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) {
   1658 		/* Each mmu_ctx is cacheline aligned. */
   1659 		mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP);
   1660 		bzero(mmu_ctxp, sizeof (mmu_ctx_t));
   1661 
   1662 		mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN,
   1663 		    (void *)ipltospl(DISP_LEVEL));
   1664 		mmu_ctxp->mmu_idx = info.mmu_idx;
   1665 		mmu_ctxp->mmu_nctxs = info.mmu_nctxs;
   1666 		/*
   1667 		 * Globally for lifetime of a system,
   1668 		 * gnum must always increase.
   1669 		 * mmu_saved_gnum is protected by the cpu_lock.
   1670 		 */
   1671 		mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1;
   1672 		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
   1673 
   1674 		sfmmu_mmu_kstat_create(mmu_ctxp);
   1675 
   1676 		mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
   1677 	} else {
   1678 		ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
   1679 	}
   1680 
   1681 	/*
   1682 	 * The mmu_lock is acquired here to prevent races with
   1683 	 * the wrap-around code.
   1684 	 */
   1685 	mutex_enter(&mmu_ctxp->mmu_lock);
   1686 
   1687 
   1688 	mmu_ctxp->mmu_ncpus++;
   1689 	CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id);
   1690 	CPU_MMU_IDX(cp) = info.mmu_idx;
   1691 	CPU_MMU_CTXP(cp) = mmu_ctxp;
   1692 
   1693 	mutex_exit(&mmu_ctxp->mmu_lock);
   1694 }
   1695 
   1696 /*
   1697  * Called to perform MMU context-related cleanup for a CPU.
   1698  */
   1699 void
   1700 sfmmu_cpu_cleanup(cpu_t *cp)
   1701 {
   1702 	mmu_ctx_t	*mmu_ctxp;
   1703 
   1704 	ASSERT(MUTEX_HELD(&cpu_lock));
   1705 
   1706 	mmu_ctxp = CPU_MMU_CTXP(cp);
   1707 	ASSERT(mmu_ctxp != NULL);
   1708 
   1709 	/*
   1710 	 * The mmu_lock is acquired here to prevent races with
   1711 	 * the wrap-around code.
   1712 	 */
   1713 	mutex_enter(&mmu_ctxp->mmu_lock);
   1714 
   1715 	CPU_MMU_CTXP(cp) = NULL;
   1716 
   1717 	CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id);
   1718 	if (--mmu_ctxp->mmu_ncpus == 0) {
   1719 		mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
   1720 		mutex_exit(&mmu_ctxp->mmu_lock);
   1721 		mutex_destroy(&mmu_ctxp->mmu_lock);
   1722 
   1723 		if (mmu_ctxp->mmu_kstat)
   1724 			kstat_delete(mmu_ctxp->mmu_kstat);
   1725 
   1726 		/* mmu_saved_gnum is protected by the cpu_lock. */
   1727 		if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
   1728 			mmu_saved_gnum = mmu_ctxp->mmu_gnum;
   1729 
   1730 		kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
   1731 
   1732 		return;
   1733 	}
   1734 
   1735 	mutex_exit(&mmu_ctxp->mmu_lock);
   1736 }
   1737 
   1738 /*
   1739  * Hat_setup, makes an address space context the current active one.
   1740  * In sfmmu this translates to setting the secondary context with the
   1741  * corresponding context.
   1742  */
   1743 void
   1744 hat_setup(struct hat *sfmmup, int allocflag)
   1745 {
   1746 	hatlock_t *hatlockp;
   1747 
   1748 	/* Init needs some special treatment. */
   1749 	if (allocflag == HAT_INIT) {
   1750 		/*
   1751 		 * Make sure that we have
   1752 		 * 1. a TSB
   1753 		 * 2. a valid ctx that doesn't get stolen after this point.
   1754 		 */
   1755 		hatlockp = sfmmu_hat_enter(sfmmup);
   1756 
   1757 		/*
   1758 		 * Swap in the TSB.  hat_init() allocates tsbinfos without
   1759 		 * TSBs, but we need one for init, since the kernel does some
   1760 		 * special things to set up its stack and needs the TSB to
   1761 		 * resolve page faults.
   1762 		 */
   1763 		sfmmu_tsb_swapin(sfmmup, hatlockp);
   1764 
   1765 		sfmmu_get_ctx(sfmmup);
   1766 
   1767 		sfmmu_hat_exit(hatlockp);
   1768 	} else {
   1769 		ASSERT(allocflag == HAT_ALLOC);
   1770 
   1771 		hatlockp = sfmmu_hat_enter(sfmmup);
   1772 		kpreempt_disable();
   1773 
   1774 		CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id);
   1775 		/*
   1776 		 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter,
   1777 		 * pagesize bits don't matter in this case since we are passing
   1778 		 * INVALID_CONTEXT to it.
   1779 		 * Compatibility Note: hw takes care of MMU_SCONTEXT1
   1780 		 */
   1781 		sfmmu_setctx_sec(INVALID_CONTEXT);
   1782 		sfmmu_clear_utsbinfo();
   1783 
   1784 		kpreempt_enable();
   1785 		sfmmu_hat_exit(hatlockp);
   1786 	}
   1787 }
   1788 
   1789 /*
   1790  * Free all the translation resources for the specified address space.
   1791  * Called from as_free when an address space is being destroyed.
   1792  */
   1793 void
   1794 hat_free_start(struct hat *sfmmup)
   1795 {
   1796 	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   1797 	ASSERT(sfmmup != ksfmmup);
   1798 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   1799 
   1800 	sfmmup->sfmmu_free = 1;
   1801 	if (sfmmup->sfmmu_scdp != NULL) {
   1802 		sfmmu_leave_scd(sfmmup, 0);
   1803 	}
   1804 
   1805 	ASSERT(sfmmup->sfmmu_scdp == NULL);
   1806 }
   1807 
   1808 void
   1809 hat_free_end(struct hat *sfmmup)
   1810 {
   1811 	int i;
   1812 
   1813 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   1814 	ASSERT(sfmmup->sfmmu_free == 1);
   1815 	ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
   1816 	ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
   1817 	ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
   1818 	ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
   1819 	ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
   1820 	ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
   1821 
   1822 	if (sfmmup->sfmmu_rmstat) {
   1823 		hat_freestat(sfmmup->sfmmu_as, NULL);
   1824 	}
   1825 
   1826 	while (sfmmup->sfmmu_tsb != NULL) {
   1827 		struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next;
   1828 		sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb);
   1829 		sfmmup->sfmmu_tsb = next;
   1830 	}
   1831 
   1832 	if (sfmmup->sfmmu_srdp != NULL) {
   1833 		sfmmu_leave_srd(sfmmup);
   1834 		ASSERT(sfmmup->sfmmu_srdp == NULL);
   1835 		for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
   1836 			if (sfmmup->sfmmu_hmeregion_links[i] != NULL) {
   1837 				kmem_free(sfmmup->sfmmu_hmeregion_links[i],
   1838 				    SFMMU_L2_HMERLINKS_SIZE);
   1839 				sfmmup->sfmmu_hmeregion_links[i] = NULL;
   1840 			}
   1841 		}
   1842 	}
   1843 	sfmmu_free_sfmmu(sfmmup);
   1844 
   1845 #ifdef DEBUG
   1846 	for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
   1847 		ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL);
   1848 	}
   1849 #endif
   1850 
   1851 	kmem_cache_free(sfmmuid_cache, sfmmup);
   1852 }
   1853 
   1854 /*
   1855  * Set up any translation structures, for the specified address space,
   1856  * that are needed or preferred when the process is being swapped in.
   1857  */
   1858 /* ARGSUSED */
   1859 void
   1860 hat_swapin(struct hat *hat)
   1861 {
   1862 	ASSERT(hat->sfmmu_xhat_provider == NULL);
   1863 }
   1864 
   1865 /*
   1866  * Free all of the translation resources, for the specified address space,
   1867  * that can be freed while the process is swapped out. Called from as_swapout.
   1868  * Also, free up the ctx that this process was using.
   1869  */
   1870 void
   1871 hat_swapout(struct hat *sfmmup)
   1872 {
   1873 	struct hmehash_bucket *hmebp;
   1874 	struct hme_blk *hmeblkp;
   1875 	struct hme_blk *pr_hblk = NULL;
   1876 	struct hme_blk *nx_hblk;
   1877 	int i;
   1878 	struct hme_blk *list = NULL;
   1879 	hatlock_t *hatlockp;
   1880 	struct tsb_info *tsbinfop;
   1881 	struct free_tsb {
   1882 		struct free_tsb *next;
   1883 		struct tsb_info *tsbinfop;
   1884 	};			/* free list of TSBs */
   1885 	struct free_tsb *freelist, *last, *next;
   1886 
   1887 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   1888 	SFMMU_STAT(sf_swapout);
   1889 
   1890 	/*
   1891 	 * There is no way to go from an as to all its translations in sfmmu.
   1892 	 * Here is one of the times when we take the big hit and traverse
   1893 	 * the hash looking for hme_blks to free up.  Not only do we free up
   1894 	 * this as hme_blks but all those that are free.  We are obviously
   1895 	 * swapping because we need memory so let's free up as much
   1896 	 * as we can.
   1897 	 *
   1898 	 * Note that we don't flush TLB/TSB here -- it's not necessary
   1899 	 * because:
   1900 	 *  1) we free the ctx we're using and throw away the TSB(s);
   1901 	 *  2) processes aren't runnable while being swapped out.
   1902 	 */
   1903 	ASSERT(sfmmup != KHATID);
   1904 	for (i = 0; i <= UHMEHASH_SZ; i++) {
   1905 		hmebp = &uhme_hash[i];
   1906 		SFMMU_HASH_LOCK(hmebp);
   1907 		hmeblkp = hmebp->hmeblkp;
   1908 		pr_hblk = NULL;
   1909 		while (hmeblkp) {
   1910 
   1911 			ASSERT(!hmeblkp->hblk_xhat_bit);
   1912 
   1913 			if ((hmeblkp->hblk_tag.htag_id == sfmmup) &&
   1914 			    !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) {
   1915 				ASSERT(!hmeblkp->hblk_shared);
   1916 				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
   1917 				    (caddr_t)get_hblk_base(hmeblkp),
   1918 				    get_hblk_endaddr(hmeblkp),
   1919 				    NULL, HAT_UNLOAD);
   1920 			}
   1921 			nx_hblk = hmeblkp->hblk_next;
   1922 			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
   1923 				ASSERT(!hmeblkp->hblk_lckcnt);
   1924 				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   1925 				    &list, 0);
   1926 			} else {
   1927 				pr_hblk = hmeblkp;
   1928 			}
   1929 			hmeblkp = nx_hblk;
   1930 		}
   1931 		SFMMU_HASH_UNLOCK(hmebp);
   1932 	}
   1933 
   1934 	sfmmu_hblks_list_purge(&list, 0);
   1935 
   1936 	/*
   1937 	 * Now free up the ctx so that others can reuse it.
   1938 	 */
   1939 	hatlockp = sfmmu_hat_enter(sfmmup);
   1940 
   1941 	sfmmu_invalidate_ctx(sfmmup);
   1942 
   1943 	/*
   1944 	 * Free TSBs, but not tsbinfos, and set SWAPPED flag.
   1945 	 * If TSBs were never swapped in, just return.
   1946 	 * This implies that we don't support partial swapping
   1947 	 * of TSBs -- either all are swapped out, or none are.
   1948 	 *
   1949 	 * We must hold the HAT lock here to prevent racing with another
   1950 	 * thread trying to unmap TTEs from the TSB or running the post-
   1951 	 * relocator after relocating the TSB's memory.  Unfortunately, we
   1952 	 * can't free memory while holding the HAT lock or we could
   1953 	 * deadlock, so we build a list of TSBs to be freed after marking
   1954 	 * the tsbinfos as swapped out and free them after dropping the
   1955 	 * lock.
   1956 	 */
   1957 	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
   1958 		sfmmu_hat_exit(hatlockp);
   1959 		return;
   1960 	}
   1961 
   1962 	SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED);
   1963 	last = freelist = NULL;
   1964 	for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
   1965 	    tsbinfop = tsbinfop->tsb_next) {
   1966 		ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0);
   1967 
   1968 		/*
   1969 		 * Cast the TSB into a struct free_tsb and put it on the free
   1970 		 * list.
   1971 		 */
   1972 		if (freelist == NULL) {
   1973 			last = freelist = (struct free_tsb *)tsbinfop->tsb_va;
   1974 		} else {
   1975 			last->next = (struct free_tsb *)tsbinfop->tsb_va;
   1976 			last = last->next;
   1977 		}
   1978 		last->next = NULL;
   1979 		last->tsbinfop = tsbinfop;
   1980 		tsbinfop->tsb_flags |= TSB_SWAPPED;
   1981 		/*
   1982 		 * Zero out the TTE to clear the valid bit.
   1983 		 * Note we can't use a value like 0xbad because we want to
   1984 		 * ensure diagnostic bits are NEVER set on TTEs that might
   1985 		 * be loaded.  The intent is to catch any invalid access
   1986 		 * to the swapped TSB, such as a thread running with a valid
   1987 		 * context without first calling sfmmu_tsb_swapin() to
   1988 		 * allocate TSB memory.
   1989 		 */
   1990 		tsbinfop->tsb_tte.ll = 0;
   1991 	}
   1992 
   1993 	/* Now we can drop the lock and free the TSB memory. */
   1994 	sfmmu_hat_exit(hatlockp);
   1995 	for (; freelist != NULL; freelist = next) {
   1996 		next = freelist->next;
   1997 		sfmmu_tsb_free(freelist->tsbinfop);
   1998 	}
   1999 }
   2000 
   2001 /*
   2002  * Duplicate the translations of an as into another newas
   2003  */
   2004 /* ARGSUSED */
   2005 int
   2006 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len,
   2007 	uint_t flag)
   2008 {
   2009 	sf_srd_t *srdp;
   2010 	sf_scd_t *scdp;
   2011 	int i;
   2012 	extern uint_t get_color_start(struct as *);
   2013 
   2014 	ASSERT(hat->sfmmu_xhat_provider == NULL);
   2015 	ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) ||
   2016 	    (flag == HAT_DUP_SRD));
   2017 	ASSERT(hat != ksfmmup);
   2018 	ASSERT(newhat != ksfmmup);
   2019 	ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp);
   2020 
   2021 	if (flag == HAT_DUP_COW) {
   2022 		panic("hat_dup: HAT_DUP_COW not supported");
   2023 	}
   2024 
   2025 	if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) {
   2026 		ASSERT(srdp->srd_evp != NULL);
   2027 		VN_HOLD(srdp->srd_evp);
   2028 		ASSERT(srdp->srd_refcnt > 0);
   2029 		newhat->sfmmu_srdp = srdp;
   2030 		atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1);
   2031 	}
   2032 
   2033 	/*
   2034 	 * HAT_DUP_ALL flag is used after as duplication is done.
   2035 	 */
   2036 	if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) {
   2037 		ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2);
   2038 		newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags;
   2039 		if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) {
   2040 			newhat->sfmmu_flags |= HAT_4MTEXT_FLAG;
   2041 		}
   2042 
   2043 		/* check if need to join scd */
   2044 		if ((scdp = hat->sfmmu_scdp) != NULL &&
   2045 		    newhat->sfmmu_scdp != scdp) {
   2046 			int ret;
   2047 			SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map,
   2048 			    &scdp->scd_region_map, ret);
   2049 			ASSERT(ret);
   2050 			sfmmu_join_scd(scdp, newhat);
   2051 			ASSERT(newhat->sfmmu_scdp == scdp &&
   2052 			    scdp->scd_refcnt >= 2);
   2053 			for (i = 0; i < max_mmu_page_sizes; i++) {
   2054 				newhat->sfmmu_ismttecnt[i] =
   2055 				    hat->sfmmu_ismttecnt[i];
   2056 				newhat->sfmmu_scdismttecnt[i] =
   2057 				    hat->sfmmu_scdismttecnt[i];
   2058 			}
   2059 		}
   2060 
   2061 		sfmmu_check_page_sizes(newhat, 1);
   2062 	}
   2063 
   2064 	if (flag == HAT_DUP_ALL && consistent_coloring == 0 &&
   2065 	    update_proc_pgcolorbase_after_fork != 0) {
   2066 		hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as);
   2067 	}
   2068 	return (0);
   2069 }
   2070 
   2071 void
   2072 hat_memload(struct hat *hat, caddr_t addr, struct page *pp,
   2073 	uint_t attr, uint_t flags)
   2074 {
   2075 	hat_do_memload(hat, addr, pp, attr, flags,
   2076 	    SFMMU_INVALID_SHMERID);
   2077 }
   2078 
   2079 void
   2080 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
   2081 	uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
   2082 {
   2083 	uint_t rid;
   2084 	if (rcookie == HAT_INVALID_REGION_COOKIE ||
   2085 	    hat->sfmmu_xhat_provider != NULL) {
   2086 		hat_do_memload(hat, addr, pp, attr, flags,
   2087 		    SFMMU_INVALID_SHMERID);
   2088 		return;
   2089 	}
   2090 	rid = (uint_t)((uint64_t)rcookie);
   2091 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   2092 	hat_do_memload(hat, addr, pp, attr, flags, rid);
   2093 }
   2094 
   2095 /*
   2096  * Set up addr to map to page pp with protection prot.
   2097  * As an optimization we also load the TSB with the
   2098  * corresponding tte but it is no big deal if  the tte gets kicked out.
   2099  */
   2100 static void
   2101 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp,
   2102 	uint_t attr, uint_t flags, uint_t rid)
   2103 {
   2104 	tte_t tte;
   2105 
   2106 
   2107 	ASSERT(hat != NULL);
   2108 	ASSERT(PAGE_LOCKED(pp));
   2109 	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
   2110 	ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
   2111 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   2112 	SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE);
   2113 
   2114 	if (PP_ISFREE(pp)) {
   2115 		panic("hat_memload: loading a mapping to free page %p",
   2116 		    (void *)pp);
   2117 	}
   2118 
   2119 	if (hat->sfmmu_xhat_provider) {
   2120 		/* no regions for xhats */
   2121 		ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
   2122 		XHAT_MEMLOAD(hat, addr, pp, attr, flags);
   2123 		return;
   2124 	}
   2125 
   2126 	ASSERT((hat == ksfmmup) ||
   2127 	    AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock));
   2128 
   2129 	if (flags & ~SFMMU_LOAD_ALLFLAG)
   2130 		cmn_err(CE_NOTE, "hat_memload: unsupported flags %d",
   2131 		    flags & ~SFMMU_LOAD_ALLFLAG);
   2132 
   2133 	if (hat->sfmmu_rmstat)
   2134 		hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr);
   2135 
   2136 #if defined(SF_ERRATA_57)
   2137 	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
   2138 	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
   2139 	    !(flags & HAT_LOAD_SHARE)) {
   2140 		cmn_err(CE_WARN, "hat_memload: illegal attempt to make user "
   2141 		    " page executable");
   2142 		attr &= ~PROT_EXEC;
   2143 	}
   2144 #endif
   2145 
   2146 	sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
   2147 	(void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid);
   2148 
   2149 	/*
   2150 	 * Check TSB and TLB page sizes.
   2151 	 */
   2152 	if ((flags & HAT_LOAD_SHARE) == 0) {
   2153 		sfmmu_check_page_sizes(hat, 1);
   2154 	}
   2155 }
   2156 
   2157 /*
   2158  * hat_devload can be called to map real memory (e.g.
   2159  * /dev/kmem) and even though hat_devload will determine pf is
   2160  * for memory, it will be unable to get a shared lock on the
   2161  * page (because someone else has it exclusively) and will
   2162  * pass dp = NULL.  If tteload doesn't get a non-NULL
   2163  * page pointer it can't cache memory.
   2164  */
   2165 void
   2166 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn,
   2167 	uint_t attr, int flags)
   2168 {
   2169 	tte_t tte;
   2170 	struct page *pp = NULL;
   2171 	int use_lgpg = 0;
   2172 
   2173 	ASSERT(hat != NULL);
   2174 
   2175 	if (hat->sfmmu_xhat_provider) {
   2176 		XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags);
   2177 		return;
   2178 	}
   2179 
   2180 	ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
   2181 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   2182 	ASSERT((hat == ksfmmup) ||
   2183 	    AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock));
   2184 	if (len == 0)
   2185 		panic("hat_devload: zero len");
   2186 	if (flags & ~SFMMU_LOAD_ALLFLAG)
   2187 		cmn_err(CE_NOTE, "hat_devload: unsupported flags %d",
   2188 		    flags & ~SFMMU_LOAD_ALLFLAG);
   2189 
   2190 #if defined(SF_ERRATA_57)
   2191 	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
   2192 	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
   2193 	    !(flags & HAT_LOAD_SHARE)) {
   2194 		cmn_err(CE_WARN, "hat_devload: illegal attempt to make user "
   2195 		    " page executable");
   2196 		attr &= ~PROT_EXEC;
   2197 	}
   2198 #endif
   2199 
   2200 	/*
   2201 	 * If it's a memory page find its pp
   2202 	 */
   2203 	if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) {
   2204 		pp = page_numtopp_nolock(pfn);
   2205 		if (pp == NULL) {
   2206 			flags |= HAT_LOAD_NOCONSIST;
   2207 		} else {
   2208 			if (PP_ISFREE(pp)) {
   2209 				panic("hat_memload: loading "
   2210 				    "a mapping to free page %p",
   2211 				    (void *)pp);
   2212 			}
   2213 			if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
   2214 				panic("hat_memload: loading a mapping "
   2215 				    "to unlocked relocatable page %p",
   2216 				    (void *)pp);
   2217 			}
   2218 			ASSERT(len == MMU_PAGESIZE);
   2219 		}
   2220 	}
   2221 
   2222 	if (hat->sfmmu_rmstat)
   2223 		hat_resvstat(len, hat->sfmmu_as, addr);
   2224 
   2225 	if (flags & HAT_LOAD_NOCONSIST) {
   2226 		attr |= SFMMU_UNCACHEVTTE;
   2227 		use_lgpg = 1;
   2228 	}
   2229 	if (!pf_is_memory(pfn)) {
   2230 		attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC;
   2231 		use_lgpg = 1;
   2232 		switch (attr & HAT_ORDER_MASK) {
   2233 			case HAT_STRICTORDER:
   2234 			case HAT_UNORDERED_OK:
   2235 				/*
   2236 				 * we set the side effect bit for all non
   2237 				 * memory mappings unless merging is ok
   2238 				 */
   2239 				attr |= SFMMU_SIDEFFECT;
   2240 				break;
   2241 			case HAT_MERGING_OK:
   2242 			case HAT_LOADCACHING_OK:
   2243 			case HAT_STORECACHING_OK:
   2244 				break;
   2245 			default:
   2246 				panic("hat_devload: bad attr");
   2247 				break;
   2248 		}
   2249 	}
   2250 	while (len) {
   2251 		if (!use_lgpg) {
   2252 			sfmmu_memtte(&tte, pfn, attr, TTE8K);
   2253 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2254 			    flags, SFMMU_INVALID_SHMERID);
   2255 			len -= MMU_PAGESIZE;
   2256 			addr += MMU_PAGESIZE;
   2257 			pfn++;
   2258 			continue;
   2259 		}
   2260 		/*
   2261 		 *  try to use large pages, check va/pa alignments
   2262 		 *  Note that 32M/256M page sizes are not (yet) supported.
   2263 		 */
   2264 		if ((len >= MMU_PAGESIZE4M) &&
   2265 		    !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
   2266 		    !(disable_large_pages & (1 << TTE4M)) &&
   2267 		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) {
   2268 			sfmmu_memtte(&tte, pfn, attr, TTE4M);
   2269 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2270 			    flags, SFMMU_INVALID_SHMERID);
   2271 			len -= MMU_PAGESIZE4M;
   2272 			addr += MMU_PAGESIZE4M;
   2273 			pfn += MMU_PAGESIZE4M / MMU_PAGESIZE;
   2274 		} else if ((len >= MMU_PAGESIZE512K) &&
   2275 		    !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
   2276 		    !(disable_large_pages & (1 << TTE512K)) &&
   2277 		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) {
   2278 			sfmmu_memtte(&tte, pfn, attr, TTE512K);
   2279 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2280 			    flags, SFMMU_INVALID_SHMERID);
   2281 			len -= MMU_PAGESIZE512K;
   2282 			addr += MMU_PAGESIZE512K;
   2283 			pfn += MMU_PAGESIZE512K / MMU_PAGESIZE;
   2284 		} else if ((len >= MMU_PAGESIZE64K) &&
   2285 		    !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
   2286 		    !(disable_large_pages & (1 << TTE64K)) &&
   2287 		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) {
   2288 			sfmmu_memtte(&tte, pfn, attr, TTE64K);
   2289 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2290 			    flags, SFMMU_INVALID_SHMERID);
   2291 			len -= MMU_PAGESIZE64K;
   2292 			addr += MMU_PAGESIZE64K;
   2293 			pfn += MMU_PAGESIZE64K / MMU_PAGESIZE;
   2294 		} else {
   2295 			sfmmu_memtte(&tte, pfn, attr, TTE8K);
   2296 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2297 			    flags, SFMMU_INVALID_SHMERID);
   2298 			len -= MMU_PAGESIZE;
   2299 			addr += MMU_PAGESIZE;
   2300 			pfn++;
   2301 		}
   2302 	}
   2303 
   2304 	/*
   2305 	 * Check TSB and TLB page sizes.
   2306 	 */
   2307 	if ((flags & HAT_LOAD_SHARE) == 0) {
   2308 		sfmmu_check_page_sizes(hat, 1);
   2309 	}
   2310 }
   2311 
   2312 void
   2313 hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
   2314 	struct page **pps, uint_t attr, uint_t flags)
   2315 {
   2316 	hat_do_memload_array(hat, addr, len, pps, attr, flags,
   2317 	    SFMMU_INVALID_SHMERID);
   2318 }
   2319 
   2320 void
   2321 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
   2322 	struct page **pps, uint_t attr, uint_t flags,
   2323 	hat_region_cookie_t rcookie)
   2324 {
   2325 	uint_t rid;
   2326 	if (rcookie == HAT_INVALID_REGION_COOKIE ||
   2327 	    hat->sfmmu_xhat_provider != NULL) {
   2328 		hat_do_memload_array(hat, addr, len, pps, attr, flags,
   2329 		    SFMMU_INVALID_SHMERID);
   2330 		return;
   2331 	}
   2332 	rid = (uint_t)((uint64_t)rcookie);
   2333 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   2334 	hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
   2335 }
   2336 
   2337 /*
   2338  * Map the largest extend possible out of the page array. The array may NOT
   2339  * be in order.  The largest possible mapping a page can have
   2340  * is specified in the p_szc field.  The p_szc field
   2341  * cannot change as long as there any mappings (large or small)
   2342  * to any of the pages that make up the large page. (ie. any
   2343  * promotion/demotion of page size is not up to the hat but up to
   2344  * the page free list manager).  The array
   2345  * should consist of properly aligned contigous pages that are
   2346  * part of a big page for a large mapping to be created.
   2347  */
   2348 static void
   2349 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len,
   2350 	struct page **pps, uint_t attr, uint_t flags, uint_t rid)
   2351 {
   2352 	int  ttesz;
   2353 	size_t mapsz;
   2354 	pgcnt_t	numpg, npgs;
   2355 	tte_t tte;
   2356 	page_t *pp;
   2357 	uint_t large_pages_disable;
   2358 
   2359 	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
   2360 	SFMMU_VALIDATE_HMERID(hat, rid, addr, len);
   2361 
   2362 	if (hat->sfmmu_xhat_provider) {
   2363 		ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
   2364 		XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags);
   2365 		return;
   2366 	}
   2367 
   2368 	if (hat->sfmmu_rmstat)
   2369 		hat_resvstat(len, hat->sfmmu_as, addr);
   2370 
   2371 #if defined(SF_ERRATA_57)
   2372 	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
   2373 	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
   2374 	    !(flags & HAT_LOAD_SHARE)) {
   2375 		cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make "
   2376 		    "user page executable");
   2377 		attr &= ~PROT_EXEC;
   2378 	}
   2379 #endif
   2380 
   2381 	/* Get number of pages */
   2382 	npgs = len >> MMU_PAGESHIFT;
   2383 
   2384 	if (flags & HAT_LOAD_SHARE) {
   2385 		large_pages_disable = disable_ism_large_pages;
   2386 	} else {
   2387 		large_pages_disable = disable_large_pages;
   2388 	}
   2389 
   2390 	if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) {
   2391 		sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
   2392 		    rid);
   2393 		return;
   2394 	}
   2395 
   2396 	while (npgs >= NHMENTS) {
   2397 		pp = *pps;
   2398 		for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) {
   2399 			/*
   2400 			 * Check if this page size is disabled.
   2401 			 */
   2402 			if (large_pages_disable & (1 << ttesz))
   2403 				continue;
   2404 
   2405 			numpg = TTEPAGES(ttesz);
   2406 			mapsz = numpg << MMU_PAGESHIFT;
   2407 			if ((npgs >= numpg) &&
   2408 			    IS_P2ALIGNED(addr, mapsz) &&
   2409 			    IS_P2ALIGNED(pp->p_pagenum, numpg)) {
   2410 				/*
   2411 				 * At this point we have enough pages and
   2412 				 * we know the virtual address and the pfn
   2413 				 * are properly aligned.  We still need
   2414 				 * to check for physical contiguity but since
   2415 				 * it is very likely that this is the case
   2416 				 * we will assume they are so and undo
   2417 				 * the request if necessary.  It would
   2418 				 * be great if we could get a hint flag
   2419 				 * like HAT_CONTIG which would tell us
   2420 				 * the pages are contigous for sure.
   2421 				 */
   2422 				sfmmu_memtte(&tte, (*pps)->p_pagenum,
   2423 				    attr, ttesz);
   2424 				if (!sfmmu_tteload_array(hat, &tte, addr,
   2425 				    pps, flags, rid)) {
   2426 					break;
   2427 				}
   2428 			}
   2429 		}
   2430 		if (ttesz == TTE8K) {
   2431 			/*
   2432 			 * We were not able to map array using a large page
   2433 			 * batch a hmeblk or fraction at a time.
   2434 			 */
   2435 			numpg = ((uintptr_t)addr >> MMU_PAGESHIFT)
   2436 			    & (NHMENTS-1);
   2437 			numpg = NHMENTS - numpg;
   2438 			ASSERT(numpg <= npgs);
   2439 			mapsz = numpg * MMU_PAGESIZE;
   2440 			sfmmu_memload_batchsmall(hat, addr, pps, attr, flags,
   2441 			    numpg, rid);
   2442 		}
   2443 		addr += mapsz;
   2444 		npgs -= numpg;
   2445 		pps += numpg;
   2446 	}
   2447 
   2448 	if (npgs) {
   2449 		sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
   2450 		    rid);
   2451 	}
   2452 
   2453 	/*
   2454 	 * Check TSB and TLB page sizes.
   2455 	 */
   2456 	if ((flags & HAT_LOAD_SHARE) == 0) {
   2457 		sfmmu_check_page_sizes(hat, 1);
   2458 	}
   2459 }
   2460 
   2461 /*
   2462  * Function tries to batch 8K pages into the same hme blk.
   2463  */
   2464 static void
   2465 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps,
   2466 		    uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid)
   2467 {
   2468 	tte_t	tte;
   2469 	page_t *pp;
   2470 	struct hmehash_bucket *hmebp;
   2471 	struct hme_blk *hmeblkp;
   2472 	int	index;
   2473 
   2474 	while (npgs) {
   2475 		/*
   2476 		 * Acquire the hash bucket.
   2477 		 */
   2478 		hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K,
   2479 		    rid);
   2480 		ASSERT(hmebp);
   2481 
   2482 		/*
   2483 		 * Find the hment block.
   2484 		 */
   2485 		hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr,
   2486 		    TTE8K, flags, rid);
   2487 		ASSERT(hmeblkp);
   2488 
   2489 		do {
   2490 			/*
   2491 			 * Make the tte.
   2492 			 */
   2493 			pp = *pps;
   2494 			sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
   2495 
   2496 			/*
   2497 			 * Add the translation.
   2498 			 */
   2499 			(void) sfmmu_tteload_addentry(hat, hmeblkp, &tte,
   2500 			    vaddr, pps, flags, rid);
   2501 
   2502 			/*
   2503 			 * Goto next page.
   2504 			 */
   2505 			pps++;
   2506 			npgs--;
   2507 
   2508 			/*
   2509 			 * Goto next address.
   2510 			 */
   2511 			vaddr += MMU_PAGESIZE;
   2512 
   2513 			/*
   2514 			 * Don't crossover into a different hmentblk.
   2515 			 */
   2516 			index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) &
   2517 			    (NHMENTS-1));
   2518 
   2519 		} while (index != 0 && npgs != 0);
   2520 
   2521 		/*
   2522 		 * Release the hash bucket.
   2523 		 */
   2524 
   2525 		sfmmu_tteload_release_hashbucket(hmebp);
   2526 	}
   2527 }
   2528 
   2529 /*
   2530  * Construct a tte for a page:
   2531  *
   2532  * tte_valid = 1
   2533  * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only)
   2534  * tte_size = size
   2535  * tte_nfo = attr & HAT_NOFAULT
   2536  * tte_ie = attr & HAT_STRUCTURE_LE
   2537  * tte_hmenum = hmenum
   2538  * tte_pahi = pp->p_pagenum >> TTE_PASHIFT;
   2539  * tte_palo = pp->p_pagenum & TTE_PALOMASK;
   2540  * tte_ref = 1 (optimization)
   2541  * tte_wr_perm = attr & PROT_WRITE;
   2542  * tte_no_sync = attr & HAT_NOSYNC
   2543  * tte_lock = attr & SFMMU_LOCKTTE
   2544  * tte_cp = !(attr & SFMMU_UNCACHEPTTE)
   2545  * tte_cv = !(attr & SFMMU_UNCACHEVTTE)
   2546  * tte_e = attr & SFMMU_SIDEFFECT
   2547  * tte_priv = !(attr & PROT_USER)
   2548  * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt)
   2549  * tte_glb = 0
   2550  */
   2551 void
   2552 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
   2553 {
   2554 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   2555 
   2556 	ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
   2557 	ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);
   2558 
   2559 	if (TTE_IS_NOSYNC(ttep)) {
   2560 		TTE_SET_REF(ttep);
   2561 		if (TTE_IS_WRITABLE(ttep)) {
   2562 			TTE_SET_MOD(ttep);
   2563 		}
   2564 	}
   2565 	if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
   2566 		panic("sfmmu_memtte: can't set both NFO and EXEC bits");
   2567 	}
   2568 }
   2569 
/*
 * This function will add a translation to the hme_blk and allocate the
 * hme_blk if one does not exist.
 * If a page structure is specified then it will add the
 * corresponding hment to the mapping list.
 * It will also update the hmenum field for the tte.
 *
 * Currently this function is only used for kernel mappings (the hat must
 * be ksfmmup), so an invalid region id is passed to sfmmu_tteload_array().
 * The single page pointer is handed over as a one-element page array and
 * the return value of sfmmu_tteload_array() is deliberately ignored.
 */
void
sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp,
	uint_t flags)
{
	ASSERT(sfmmup == ksfmmup);
	(void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags,
	    SFMMU_INVALID_SHMERID);
}
   2588 
   2589 /*
   2590  * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB.
   2591  * Assumes that a particular page size may only be resident in one TSB.
   2592  */
   2593 static void
   2594 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz)
   2595 {
   2596 	struct tsb_info *tsbinfop = NULL;
   2597 	uint64_t tag;
   2598 	struct tsbe *tsbe_addr;
   2599 	uint64_t tsb_base;
   2600 	uint_t tsb_size;
   2601 	int vpshift = MMU_PAGESHIFT;
   2602 	int phys = 0;
   2603 
   2604 	if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */
   2605 		phys = ktsb_phys;
   2606 		if (ttesz >= TTE4M) {
   2607 #ifndef sun4v
   2608 			ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
   2609 #endif
   2610 			tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
   2611 			tsb_size = ktsb4m_szcode;
   2612 		} else {
   2613 			tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
   2614 			tsb_size = ktsb_szcode;
   2615 		}
   2616 	} else {
   2617 		SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
   2618 
   2619 		/*
   2620 		 * If there isn't a TSB for this page size, or the TSB is
   2621 		 * swapped out, there is nothing to do.  Note that the latter
   2622 		 * case seems impossible but can occur if hat_pageunload()
   2623 		 * is called on an ISM mapping while the process is swapped
   2624 		 * out.
   2625 		 */
   2626 		if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
   2627 			return;
   2628 
   2629 		/*
   2630 		 * If another thread is in the middle of relocating a TSB
   2631 		 * we can't unload the entry so set a flag so that the
   2632 		 * TSB will be flushed before it can be accessed by the
   2633 		 * process.
   2634 		 */
   2635 		if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
   2636 			if (ttep == NULL)
   2637 				tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
   2638 			return;
   2639 		}
   2640 #if defined(UTSB_PHYS)
   2641 		phys = 1;
   2642 		tsb_base = (uint64_t)tsbinfop->tsb_pa;
   2643 #else
   2644 		tsb_base = (uint64_t)tsbinfop->tsb_va;
   2645 #endif
   2646 		tsb_size = tsbinfop->tsb_szc;
   2647 	}
   2648 	if (ttesz >= TTE4M)
   2649 		vpshift = MMU_PAGESHIFT4M;
   2650 
   2651 	tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
   2652 	tag = sfmmu_make_tsbtag(vaddr);
   2653 
   2654 	if (ttep == NULL) {
   2655 		sfmmu_unload_tsbe(tsbe_addr, tag, phys);
   2656 	} else {
   2657 		if (ttesz >= TTE4M) {
   2658 			SFMMU_STAT(sf_tsb_load4m);
   2659 		} else {
   2660 			SFMMU_STAT(sf_tsb_load8k);
   2661 		}
   2662 
   2663 		sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys);
   2664 	}
   2665 }
   2666 
   2667 /*
   2668  * Unmap all entries from [start, end) matching the given page size.
   2669  *
   2670  * This function is used primarily to unmap replicated 64K or 512K entries
   2671  * from the TSB that are inserted using the base page size TSB pointer, but
   2672  * it may also be called to unmap a range of addresses from the TSB.
   2673  */
   2674 void
   2675 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz)
   2676 {
   2677 	struct tsb_info *tsbinfop;
   2678 	uint64_t tag;
   2679 	struct tsbe *tsbe_addr;
   2680 	caddr_t vaddr;
   2681 	uint64_t tsb_base;
   2682 	int vpshift, vpgsz;
   2683 	uint_t tsb_size;
   2684 	int phys = 0;
   2685 
   2686 	/*
   2687 	 * Assumptions:
   2688 	 *  If ttesz == 8K, 64K or 512K, we walk through the range 8K
   2689 	 *  at a time shooting down any valid entries we encounter.
   2690 	 *
   2691 	 *  If ttesz >= 4M we walk the range 4M at a time shooting
   2692 	 *  down any valid mappings we find.
   2693 	 */
   2694 	if (sfmmup == ksfmmup) {
   2695 		phys = ktsb_phys;
   2696 		if (ttesz >= TTE4M) {
   2697 #ifndef sun4v
   2698 			ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
   2699 #endif
   2700 			tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
   2701 			tsb_size = ktsb4m_szcode;
   2702 		} else {
   2703 			tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
   2704 			tsb_size = ktsb_szcode;
   2705 		}
   2706 	} else {
   2707 		SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
   2708 
   2709 		/*
   2710 		 * If there isn't a TSB for this page size, or the TSB is
   2711 		 * swapped out, there is nothing to do.  Note that the latter
   2712 		 * case seems impossible but can occur if hat_pageunload()
   2713 		 * is called on an ISM mapping while the process is swapped
   2714 		 * out.
   2715 		 */
   2716 		if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
   2717 			return;
   2718 
   2719 		/*
   2720 		 * If another thread is in the middle of relocating a TSB
   2721 		 * we can't unload the entry so set a flag so that the
   2722 		 * TSB will be flushed before it can be accessed by the
   2723 		 * process.
   2724 		 */
   2725 		if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
   2726 			tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
   2727 			return;
   2728 		}
   2729 #if defined(UTSB_PHYS)
   2730 		phys = 1;
   2731 		tsb_base = (uint64_t)tsbinfop->tsb_pa;
   2732 #else
   2733 		tsb_base = (uint64_t)tsbinfop->tsb_va;
   2734 #endif
   2735 		tsb_size = tsbinfop->tsb_szc;
   2736 	}
   2737 	if (ttesz >= TTE4M) {
   2738 		vpshift = MMU_PAGESHIFT4M;
   2739 		vpgsz = MMU_PAGESIZE4M;
   2740 	} else {
   2741 		vpshift = MMU_PAGESHIFT;
   2742 		vpgsz = MMU_PAGESIZE;
   2743 	}
   2744 
   2745 	for (vaddr = start; vaddr < end; vaddr += vpgsz) {
   2746 		tag = sfmmu_make_tsbtag(vaddr);
   2747 		tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
   2748 		sfmmu_unload_tsbe(tsbe_addr, tag, phys);
   2749 	}
   2750 }
   2751 
   2752 /*
   2753  * Select the optimum TSB size given the number of mappings
   2754  * that need to be cached.
   2755  */
   2756 static int
   2757 sfmmu_select_tsb_szc(pgcnt_t pgcnt)
   2758 {
   2759 	int szc = 0;
   2760 
   2761 #ifdef DEBUG
   2762 	if (tsb_grow_stress) {
   2763 		uint32_t randval = (uint32_t)gettick() >> 4;
   2764 		return (randval % (tsb_max_growsize + 1));
   2765 	}
   2766 #endif	/* DEBUG */
   2767 
   2768 	while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc)))
   2769 		szc++;
   2770 	return (szc);
   2771 }
   2772 
   2773 /*
   2774  * This function will add a translation to the hme_blk and allocate the
   2775  * hme_blk if one does not exist.
   2776  * If a page structure is specified then it will add the
   2777  * corresponding hment to the mapping list.
   2778  * It will also update the hmenum field for the tte.
   2779  * Furthermore, it attempts to create a large page translation
   2780  * for <addr,hat> at page array pps.  It assumes addr and first
   2781  * pp is correctly aligned.  It returns 0 if successful and 1 otherwise.
   2782  */
   2783 static int
   2784 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr,
   2785 	page_t **pps, uint_t flags, uint_t rid)
   2786 {
   2787 	struct hmehash_bucket *hmebp;
   2788 	struct hme_blk *hmeblkp;
   2789 	int 	ret;
   2790 	uint_t	size;
   2791 
   2792 	/*
   2793 	 * Get mapping size.
   2794 	 */
   2795 	size = TTE_CSZ(ttep);
   2796 	ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
   2797 
   2798 	/*
   2799 	 * Acquire the hash bucket.
   2800 	 */
   2801 	hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid);
   2802 	ASSERT(hmebp);
   2803 
   2804 	/*
   2805 	 * Find the hment block.
   2806 	 */
   2807 	hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags,
   2808 	    rid);
   2809 	ASSERT(hmeblkp);
   2810 
   2811 	/*
   2812 	 * Add the translation.
   2813 	 */
   2814 	ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags,
   2815 	    rid);
   2816 
   2817 	/*
   2818 	 * Release the hash bucket.
   2819 	 */
   2820 	sfmmu_tteload_release_hashbucket(hmebp);
   2821 
   2822 	return (ret);
   2823 }
   2824 
   2825 /*
   2826  * Function locks and returns a pointer to the hash bucket for vaddr and size.
   2827  */
   2828 static struct hmehash_bucket *
   2829 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size,
   2830     uint_t rid)
   2831 {
   2832 	struct hmehash_bucket *hmebp;
   2833 	int hmeshift;
   2834 	void *htagid = sfmmutohtagid(sfmmup, rid);
   2835 
   2836 	ASSERT(htagid != NULL);
   2837 
   2838 	hmeshift = HME_HASH_SHIFT(size);
   2839 
   2840 	hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift);
   2841 
   2842 	SFMMU_HASH_LOCK(hmebp);
   2843 
   2844 	return (hmebp);
   2845 }
   2846 
/*
 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
 * hmeblk doesn't exist for the [sfmmup, vaddr & size] signature, a hmeblk is
 * allocated.
 *
 * A lingering hmeblk of a different size, or one that was previously used
 * as a shadow hmeblk, is dealt with first and the search is then repeated.
 * hmeblks removed from the hash along the way are collected on a local
 * list which is handed to sfmmu_hblks_list_purge() before returning.
 *
 * The caller passes in hmebp already locked; the lock is dropped and
 * retaken here only while waiting on hblk_reserve_lock.
 */
static struct hme_blk *
sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp,
	caddr_t vaddr, uint_t size, uint_t flags, uint_t rid)
{
	hmeblk_tag hblktag;
	int hmeshift;
	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;

	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));

	/*
	 * Build the tag used to search the bucket.
	 */
	hblktag.htag_id = sfmmutohtagid(sfmmup, rid);
	ASSERT(hblktag.htag_id != NULL);
	hmeshift = HME_HASH_SHIFT(size);
	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
	hblktag.htag_rehash = HME_HASH_REHASH(size);
	hblktag.htag_rid = rid;

ttearray_realloc:

	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);

	/*
	 * We block until hblk_reserve_lock is released; it's held by
	 * the thread, temporarily using hblk_reserve, until hblk_reserve is
	 * replaced by a hblk from sfmmu8_cache.
	 *
	 * The hash lock is dropped while waiting, so the search has to be
	 * redone once it has been retaken.
	 */
	if (hmeblkp == (struct hme_blk *)hblk_reserve &&
	    hblk_reserve_thread != curthread) {
		SFMMU_HASH_UNLOCK(hmebp);
		mutex_enter(&hblk_reserve_lock);
		mutex_exit(&hblk_reserve_lock);
		SFMMU_STAT(sf_hblk_reserve_hit);
		SFMMU_HASH_LOCK(hmebp);
		goto ttearray_realloc;
	}

	if (hmeblkp == NULL) {
		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
		    hblktag, flags, rid);
		/* the new hmeblk is shared exactly when rid is valid */
		ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
		ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
	} else {
		/*
		 * It is possible for 8k and 64k hblks to collide since they
		 * have the same rehash value. This is because we
		 * lazily free hblks and 8K/64K blks could be lingering.
		 * If we find a size mismatch we free the block and try again.
		 */
		if (get_hblk_ttesz(hmeblkp) != size) {
			ASSERT(!hmeblkp->hblk_vcnt);
			ASSERT(!hmeblkp->hblk_hmecnt);
			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
			    &list, 0);
			goto ttearray_realloc;
		}
		if (hmeblkp->hblk_shw_bit) {
			/*
			 * if the hblk was previously used as a shadow hblk then
			 * we will change it to a normal hblk
			 */
			ASSERT(!hmeblkp->hblk_shared);
			if (hmeblkp->hblk_shw_mask) {
				/*
				 * The shadow mask is still set: clean up
				 * first, then search again.
				 */
				sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp);
				ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
				goto ttearray_realloc;
			} else {
				hmeblkp->hblk_shw_bit = 0;
			}
		}
		SFMMU_STAT(sf_hblk_hit);
	}

	/*
	 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
	 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
	 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
	 * just add these hmeblks to the per-cpu pending queue.
	 */
	sfmmu_hblks_list_purge(&list, 1);

	ASSERT(get_hblk_ttesz(hmeblkp) == size);
	ASSERT(!hmeblkp->hblk_shw_bit);
	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
	ASSERT(hmeblkp->hblk_tag.htag_rid == rid);

	return (hmeblkp);
}
   2940 
/*
 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1
 * otherwise.
 *
 * The only failure return in this function is sfmmu_pagearray_setup()
 * rejecting the page array for a mapping larger than 8K; that happens
 * before the tte in the hment is modified.
 *
 * *pps is the page backing vaddr and may be NULL, in which case no mapping
 * list work is done.  Note that the caller's tte (ttep) is modified here:
 * ref/mod are set for writable HAT_LOAD_SHARE mappings, the virtually
 * cacheable bit may be cleared (VAC only) and tte_hmenum is filled in.
 */
static int
sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep,
	caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid)
{
	page_t *pp = *pps;
	int hmenum, size, remap;
	tte_t tteold, flush_tte;
#ifdef DEBUG
	tte_t orig_old;
#endif /* DEBUG */
	struct sf_hment *sfhme;
	kmutex_t *pml, *pmtx;
	hatlock_t *hatlockp;
	int myflt;

	/*
	 * remove this panic when we decide to let user virtual address
	 * space be >= USERLIMIT.
	 */
	if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT)
		panic("user addr %p in kernel space", (void *)vaddr);
#if defined(TTE_IS_GLOBAL)
	if (TTE_IS_GLOBAL(ttep))
		panic("sfmmu_tteload: creating global tte");
#endif

#ifdef DEBUG
	if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) &&
	    !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans)
		panic("sfmmu_tteload: non cacheable memory tte");
#endif /* DEBUG */

	/* don't simulate dirty bit for writeable ISM/DISM mappings */
	if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) {
		TTE_SET_REF(ttep);
		TTE_SET_MOD(ttep);
	}

	if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) ||
	    !TTE_IS_MOD(ttep)) {
		/*
		 * Don't load TSB for dummy as in ISM.  Also don't preload
		 * the TSB if the TTE isn't writable since we're likely to
		 * fault on it again -- preloading can be fairly expensive.
		 */
		flags |= SFMMU_NO_TSBLOAD;
	}

	/*
	 * Per page size load statistics.
	 */
	size = TTE_CSZ(ttep);
	switch (size) {
	case TTE8K:
		SFMMU_STAT(sf_tteload8k);
		break;
	case TTE64K:
		SFMMU_STAT(sf_tteload64k);
		break;
	case TTE512K:
		SFMMU_STAT(sf_tteload512k);
		break;
	case TTE4M:
		SFMMU_STAT(sf_tteload4m);
		break;
	case (TTE32M):
		SFMMU_STAT(sf_tteload32m);
		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
		break;
	case (TTE256M):
		SFMMU_STAT(sf_tteload256m);
		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
		break;
	}

	ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);

	/*
	 * Get the hment for vaddr within the hmeblk, along with its index.
	 */
	HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum);

	/*
	 * Need to grab mlist lock here so that pageunload
	 * will not change tte behind us.
	 */
	if (pp) {
		pml = sfmmu_mlist_enter(pp);
	}

	/*
	 * Take a copy of the tte currently stored in the hment.
	 */
	sfmmu_copytte(&sfhme->hme_tte, &tteold);
	/*
	 * Look for corresponding hment and if valid verify
	 * pfns are equal.
	 */
	remap = TTE_IS_VALID(&tteold);
	if (remap) {
		pfn_t	new_pfn, old_pfn;

		old_pfn = TTE_TO_PFN(vaddr, &tteold);
		new_pfn = TTE_TO_PFN(vaddr, ttep);

		if (flags & HAT_LOAD_REMAP) {
			/* make sure we are remapping same type of pages */
			if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) {
				panic("sfmmu_tteload - tte remap io<->memory");
			}
			if (old_pfn != new_pfn &&
			    (pp != NULL || sfhme->hme_page != NULL)) {
				panic("sfmmu_tteload - tte remap pp != NULL");
			}
		} else if (old_pfn != new_pfn) {
			/* without HAT_LOAD_REMAP the pfn must not change */
			panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
			    (void *)hmeblkp);
		}
		ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
	}

	if (pp) {
		if (size == TTE8K) {
#ifdef VAC
			/*
			 * Handle VAC consistency
			 */
			if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) {
				sfmmu_vac_conflict(sfmmup, vaddr, pp);
			}
#endif

			/*
			 * Maintain the page's RO state: a writable mapping
			 * clears it; a read-only mapping of an unmapped,
			 * unmodified page sets it (the mod state is checked
			 * again once the page lock is held).
			 */
			if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
				pmtx = sfmmu_page_enter(pp);
				PP_CLRRO(pp);
				sfmmu_page_exit(pmtx);
			} else if (!PP_ISMAPPED(pp) &&
			    (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) {
				pmtx = sfmmu_page_enter(pp);
				if (!(PP_ISMOD(pp))) {
					PP_SETRO(pp);
				}
				sfmmu_page_exit(pmtx);
			}

		} else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
			/*
			 * sfmmu_pagearray_setup failed so return
			 */
			sfmmu_mlist_exit(pml);
			return (1);
		}
	}

	/*
	 * Make sure hment is not on a mapping list.
	 */
	ASSERT(remap || (sfhme->hme_page == NULL));

	/* if it is not a remap then hme->next better be NULL */
	ASSERT((!remap) ? sfhme->hme_next == NULL : 1);

	if (flags & HAT_LOAD_LOCK) {
		if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) {
			panic("too high lckcnt-hmeblk %p",
			    (void *)hmeblkp);
		}
		atomic_add_32(&hmeblkp->hblk_lckcnt, 1);

		HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK);
	}

#ifdef VAC
	if (pp && PP_ISNC(pp)) {
		/*
		 * If the physical page is marked to be uncacheable, like
		 * by a vac conflict, make sure the new mapping is also
		 * uncacheable.
		 */
		TTE_CLR_VCACHEABLE(ttep);
		ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
	}
#endif
	ttep->tte_hmenum = hmenum;

#ifdef DEBUG
	orig_old = tteold;
#endif /* DEBUG */

	/*
	 * Install the new tte in the hment, retrying for as long as
	 * sfmmu_modifytte_try() returns a negative value.  For kernel
	 * lock/remap loads the old tte is re-read from the hment before
	 * the next attempt.
	 */
	while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) {
		if ((sfmmup == KHATID) &&
		    (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) {
			sfmmu_copytte(&sfhme->hme_tte, &tteold);
		}
#ifdef DEBUG
		chk_tte(&orig_old, &tteold, ttep, hmeblkp);
#endif /* DEBUG */
	}
	ASSERT(TTE_IS_VALID(&sfhme->hme_tte));

	/*
	 * The hment did not hold a valid tte before, so account for the
	 * new translation.
	 */
	if (!TTE_IS_VALID(&tteold)) {

		atomic_add_16(&hmeblkp->hblk_vcnt, 1);
		if (rid == SFMMU_INVALID_SHMERID) {
			atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1);
		} else {
			sf_srd_t *srdp = sfmmup->sfmmu_srdp;
			sf_region_t *rgnp = srdp->srd_hmergnp[rid];
			/*
			 * We already accounted for region ttecnt's in sfmmu
			 * during hat_join_region() processing. Here we
			 * only update ttecnt's in region structure.
			 */
			atomic_add_long(&rgnp->rgn_ttecnt[size], 1);
		}
	}

	/* is the hat being loaded the one of the current process? */
	myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup);
	if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 &&
	    sfmmup != ksfmmup) {
		uchar_t tteflag = 1 << size;
		/*
		 * Record the page size in the hat's (private or region)
		 * tteflags; the hat lock is only taken when the bit is
		 * not set yet.
		 */
		if (rid == SFMMU_INVALID_SHMERID) {
			if (!(sfmmup->sfmmu_tteflags & tteflag)) {
				hatlockp = sfmmu_hat_enter(sfmmup);
				sfmmup->sfmmu_tteflags |= tteflag;
				sfmmu_hat_exit(hatlockp);
			}
		} else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
			hatlockp = sfmmu_hat_enter(sfmmup);
			sfmmup->sfmmu_rtteflags |= tteflag;
			sfmmu_hat_exit(hatlockp);
		}
		/*
		 * Update the current CPU tsbmiss area, so the current thread
		 * won't need to take the tsbmiss for the new pagesize.
		 * The other threads in the process will update their tsb
		 * miss area lazily in sfmmu_tsbmiss_exception() when they
		 * fail to find the translation for a newly added pagesize.
		 */
		if (size > TTE64K && myflt) {
			struct tsbmiss *tsbmp;
			kpreempt_disable();
			tsbmp = &tsbmiss_area[CPU->cpu_id];
			if (rid == SFMMU_INVALID_SHMERID) {
				if (!(tsbmp->uhat_tteflags & tteflag)) {
					tsbmp->uhat_tteflags |= tteflag;
				}
			} else {
				if (!(tsbmp->uhat_rtteflags & tteflag)) {
					tsbmp->uhat_rtteflags |= tteflag;
				}
			}
			kpreempt_enable();
		}
	}

	if (size >= TTE4M && (flags & HAT_LOAD_TEXT) &&
	    !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
		hatlockp = sfmmu_hat_enter(sfmmup);
		SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
		sfmmu_hat_exit(hatlockp);
	}

	/*
	 * Compute which bits differ between the old and the new tte,
	 * restricted to the bits selected by hw_tte.
	 */
	flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) &
	    hw_tte.tte_intlo;
	flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) &
	    hw_tte.tte_inthi;

	if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) {
		/*
		 * If remap and new tte differs from old tte we need
		 * to sync the mod bit and flush TLB/TSB.  We don't
		 * need to sync ref bit because we currently always set
		 * ref bit in tteload.
		 */
		ASSERT(TTE_IS_REF(ttep));
		if (TTE_IS_MOD(&tteold)) {
			sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
		}
		/*
		 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
		 * hmes are only used for read only text. Adding this code for
		 * completeness and future use of shared hmeblks with writable
		 * mappings of VMODSORT vnodes.
		 */
		if (hmeblkp->hblk_shared) {
			cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr,
			    sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1);
			xt_sync(cpuset);
			SFMMU_STAT_ADD(sf_region_remap_demap, 1);
		} else {
			sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0);
			xt_sync(sfmmup->sfmmu_cpusran);
		}
	}

	if ((flags & SFMMU_NO_TSBLOAD) == 0) {
		/*
		 * We only preload 8K and 4M mappings into the TSB, since
		 * 64K and 512K mappings are replicated and hence don't
		 * have a single, unique TSB entry. Ditto for 32M/256M.
		 */
		if (size == TTE8K || size == TTE4M) {
			sf_scd_t *scdp;
			hatlockp = sfmmu_hat_enter(sfmmup);
			/*
			 * Don't preload private TSB if the mapping is used
			 * by the shctx in the SCD.
			 */
			scdp = sfmmup->sfmmu_scdp;
			if (rid == SFMMU_INVALID_SHMERID || scdp == NULL ||
			    !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
				sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte,
				    size);
			}
			sfmmu_hat_exit(hatlockp);
		}
	}
	if (pp) {
		if (!remap) {
			/*
			 * First load of this hment: put it on the page's
			 * mapping list and count it in the hmeblk.
			 */
			HME_ADD(sfhme, pp);
			atomic_add_16(&hmeblkp->hblk_hmecnt, 1);
			ASSERT(hmeblkp->hblk_hmecnt > 0);

			/*
			 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
			 * see pageunload() for comment.
			 */
		}
		sfmmu_mlist_exit(pml);
	}

	return (0);
}
/*
 * Function unlocks the hash bucket hmebp.  The caller must hold the
 * bucket's hash lock; this is asserted before the lock is dropped.
 */
static void
sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp)
{
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
	SFMMU_HASH_UNLOCK(hmebp);
}
   3282 
/*
 * Function which checks and sets up the page array for a large
 * translation.  Will set p_vcolor, p_index, p_ro fields.
 * Assumes addr and pfnum of first page are properly aligned.
 * Will check for physical contiguity.  If the contiguity check fails the
 * p_index changes are undone and 1 is returned; otherwise 0 is returned.
 *
 * When remap is set only the p_ro handling is done for each page; the
 * vac, p_index and contiguity steps are skipped.
 */
static int
sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap)
{
	int 	i, index, ttesz;
	pfn_t	pfnum;
	pgcnt_t	npgs;
	page_t *pp, *pp1;
	kmutex_t *pmtx;
#ifdef VAC
	int osz;
	int cflags = 0;
	int vac_err = 0;
#endif
	int newidx = 0;

	ttesz = TTE_CSZ(ttep);

	ASSERT(ttesz > TTE8K);

	npgs = TTEPAGES(ttesz);
	index = PAGESZ_TO_INDEX(ttesz);

	pfnum = (*pps)->p_pagenum;
	ASSERT(IS_P2ALIGNED(pfnum, npgs));

	/*
	 * Save the first pp so we can do HAT_TMPNC at the end.
	 */
	pp1 = *pps;
#ifdef VAC
	osz = fnd_mapping_sz(pp1);
#endif

	for (i = 0; i < npgs; i++, pps++) {
		pp = *pps;
		ASSERT(PAGE_LOCKED(pp));
		ASSERT(pp->p_szc >= ttesz);
		ASSERT(pp->p_szc == pp1->p_szc);
		ASSERT(sfmmu_mlist_held(pp));

		/*
		 * XXX is it possible to maintain P_RO on the root only?
		 */
		if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
			pmtx = sfmmu_page_enter(pp);
			PP_CLRRO(pp);
			sfmmu_page_exit(pmtx);
		} else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) &&
		    !PP_ISMOD(pp)) {
			pmtx = sfmmu_page_enter(pp);
			/* check the mod state again under the page lock */
			if (!(PP_ISMOD(pp))) {
				PP_SETRO(pp);
			}
			sfmmu_page_exit(pmtx);
		}

		/*
		 * If this is a remap we skip vac & contiguity checks.
		 */
		if (remap)
			continue;

		/*
		 * set p_vcolor and detect any vac conflicts.
		 * Once a conflict has been reported the remaining pages
		 * are no longer checked.
		 */
#ifdef VAC
		if (vac_err == 0) {
			vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);

		}
#endif

		/*
		 * Save current index in case we need to undo it.
		 * Note: "PAGESZ_TO_INDEX(sz)	(1 << (sz))"
		 *	"SFMMU_INDEX_SHIFT	6"
		 *	 "SFMMU_INDEX_MASK	((1 << SFMMU_INDEX_SHIFT) - 1)"
		 *	 "PP_MAPINDEX(p_index)	(p_index & SFMMU_INDEX_MASK)"
		 *
		 * So:	index = PAGESZ_TO_INDEX(ttesz);
		 *	if ttesz == 1 then index = 0x2
		 *		    2 then index = 0x4
		 *		    3 then index = 0x8
		 *		    4 then index = 0x10
		 *		    5 then index = 0x20
		 * The code below checks if it's a new pagesize (ie, newidx)
		 * in case we need to take it back out of p_index,
		 * and then or's the new index into the existing index.
		 */
		if ((PP_MAPINDEX(pp) & index) == 0)
			newidx = 1;
		pp->p_index = (PP_MAPINDEX(pp) | index);

		/*
		 * contiguity check
		 */
		if (pp->p_pagenum != pfnum) {
			/*
			 * If we fail the contiguity test then
			 * the only thing we need to fix is the p_index field.
			 * We might get a few extra flushes but since this
			 * path is rare that is ok.  The p_ro field will
			 * get automatically fixed on the next tteload to
			 * the page.  NO TNC bit is set yet.
			 *
			 * Walk back from the current page to the first one.
			 */
			while (i >= 0) {
				pp = *pps;
				if (newidx)
					pp->p_index = (PP_MAPINDEX(pp) &
					    ~index);
				pps--;
				i--;
			}
			return (1);
		}
		pfnum++;
		addr += MMU_PAGESIZE;
	}

#ifdef VAC
	if (vac_err) {
		if (ttesz > osz) {
			/*
			 * There are some smaller mappings that cause vac
			 * conflicts. Convert all existing small mappings to
			 * TNC.
			 */
			SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
			sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
			    npgs);
		} else {
			/* EMPTY */
			/*
			 * If there exists a big page mapping,
			 * that means the whole existing big page
			 * has TNC setting already. No need to convert to
			 * TNC again.
			 */
			ASSERT(PP_ISTNC(pp1));
		}
	}
#endif	/* VAC */

	return (0);
}
   3435 
   3436 #ifdef VAC
   3437 /*
   3438  * Routine that detects vac consistency for a large page. It also
   3439  * sets virtual color for all pp's for this big mapping.
   3440  */
   3441 static int
   3442 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
   3443 {
   3444 	int vcolor, ocolor;
   3445 
   3446 	ASSERT(sfmmu_mlist_held(pp));
   3447 
   3448 	if (PP_ISNC(pp)) {
   3449 		return (HAT_TMPNC);
   3450 	}
   3451 
   3452 	vcolor = addr_to_vcolor(addr);
   3453 	if (PP_NEWPAGE(pp)) {
   3454 		PP_SET_VCOLOR(pp, vcolor);
   3455 		return (0);
   3456 	}
   3457 
   3458 	ocolor = PP_GET_VCOLOR(pp);
   3459 	if (ocolor == vcolor) {
   3460 		return (0);
   3461 	}
   3462 
   3463 	if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
   3464 		/*
   3465 		 * Previous user of page had a differnet color
   3466 		 * but since there are no current users
   3467 		 * we just flush the cache and change the color.
   3468 		 * As an optimization for large pages we flush the
   3469 		 * entire cache of that color and set a flag.
   3470 		 */
   3471 		SFMMU_STAT(sf_pgcolor_conflict);
   3472 		if (!CacheColor_IsFlushed(*cflags, ocolor)) {
   3473 			CacheColor_SetFlushed(*cflags, ocolor);
   3474 			sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
   3475 		}
   3476 		PP_SET_VCOLOR(pp, vcolor);
   3477 		return (0);
   3478 	}
   3479 
   3480 	/*
   3481 	 * We got a real conflict with a current mapping.
   3482 	 * set flags to start unencaching all mappings
   3483 	 * and return failure so we restart looping
   3484 	 * the pp array from the beginning.
   3485 	 */
   3486 	return (HAT_TMPNC);
   3487 }
   3488 #endif	/* VAC */
   3489 
   3490 /*
   3491  * creates a large page shadow hmeblk for a tte.
   3492  * The purpose of this routine is to allow us to do quick unloads because
   3493  * the vm layer can easily pass a very large but sparsely populated range.
   3494  */
   3495 static struct hme_blk *
   3496 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
   3497 {
   3498 	struct hmehash_bucket *hmebp;
   3499 	hmeblk_tag hblktag;
   3500 	int hmeshift, size, vshift;
   3501 	uint_t shw_mask, newshw_mask;
   3502 	struct hme_blk *hmeblkp;
   3503 
   3504 	ASSERT(sfmmup != KHATID);
   3505 	if (mmu_page_sizes == max_mmu_page_sizes) {
   3506 		ASSERT(ttesz < TTE256M);
   3507 	} else {
   3508 		ASSERT(ttesz < TTE4M);
   3509 		ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
   3510 		ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
   3511 	}
   3512 
   3513 	if (ttesz == TTE8K) {
   3514 		size = TTE512K;
   3515 	} else {
   3516 		size = ++ttesz;
   3517 	}
   3518 
   3519 	hblktag.htag_id = sfmmup;
   3520 	hmeshift = HME_HASH_SHIFT(size);
   3521 	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
   3522 	hblktag.htag_rehash = HME_HASH_REHASH(size);
   3523 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   3524 	hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
   3525 
   3526 	SFMMU_HASH_LOCK(hmebp);
   3527 
   3528 	HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   3529 	ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
   3530 	if (hmeblkp == NULL) {
   3531 		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
   3532 		    hblktag, flags, SFMMU_INVALID_SHMERID);
   3533 	}
   3534 	ASSERT(hmeblkp);
   3535 	if (!hmeblkp->hblk_shw_mask) {
   3536 		/*
   3537 		 * if this is a unused hblk it was just allocated or could
   3538 		 * potentially be a previous large page hblk so we need to
   3539 		 * set the shadow bit.
   3540 		 */
   3541 		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
   3542 		hmeblkp->hblk_shw_bit = 1;
   3543 	} else if (hmeblkp->hblk_shw_bit == 0) {
   3544 		panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
   3545 		    (void *)hmeblkp);
   3546 	}
   3547 	ASSERT(hmeblkp->hblk_shw_bit == 1);
   3548 	ASSERT(!hmeblkp->hblk_shared);
   3549 	vshift = vaddr_to_vshift(hblktag, vaddr, size);
   3550 	ASSERT(vshift < 8);
   3551 	/*
   3552 	 * Atomically set shw mask bit
   3553 	 */
   3554 	do {
   3555 		shw_mask = hmeblkp->hblk_shw_mask;
   3556 		newshw_mask = shw_mask | (1 << vshift);
   3557 		newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask,
   3558 		    newshw_mask);
   3559 	} while (newshw_mask != shw_mask);
   3560 
   3561 	SFMMU_HASH_UNLOCK(hmebp);
   3562 
   3563 	return (hmeblkp);
   3564 }
   3565 
   3566 /*
   3567  * This routine cleanup a previous shadow hmeblk and changes it to
   3568  * a regular hblk.  This happens rarely but it is possible
   3569  * when a process wants to use large pages and there are hblks still
   3570  * lying around from the previous as that used these hmeblks.
   3571  * The alternative was to cleanup the shadow hblks at unload time
   3572  * but since so few user processes actually use large pages, it is
   3573  * better to be lazy and cleanup at this time.
   3574  */
   3575 static void
   3576 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
   3577 	struct hmehash_bucket *hmebp)
   3578 {
   3579 	caddr_t addr, endaddr;
   3580 	int hashno, size;
   3581 
   3582 	ASSERT(hmeblkp->hblk_shw_bit);
   3583 	ASSERT(!hmeblkp->hblk_shared);
   3584 
   3585 	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
   3586 
   3587 	if (!hmeblkp->hblk_shw_mask) {
   3588 		hmeblkp->hblk_shw_bit = 0;
   3589 		return;
   3590 	}
   3591 	addr = (caddr_t)get_hblk_base(hmeblkp);
   3592 	endaddr = get_hblk_endaddr(hmeblkp);
   3593 	size = get_hblk_ttesz(hmeblkp);
   3594 	hashno = size - 1;
   3595 	ASSERT(hashno > 0);
   3596 	SFMMU_HASH_UNLOCK(hmebp);
   3597 
   3598 	sfmmu_free_hblks(sfmmup, addr, endaddr, hashno);
   3599 
   3600 	SFMMU_HASH_LOCK(hmebp);
   3601 }
   3602 
/*
 * Walk [addr, endaddr) in steps of the hmeblk span for hash level hashno
 * and unlink unused hmeblks from the hash buckets that the addresses of
 * sfmmup hash to.
 *
 * For each address the bucket is locked and its whole chain is walked:
 *  - A block whose tag matches and which is a shadow block with a
 *    non-empty shadow mask is handed to sfmmu_shadow_hcleanup() and the
 *    chain walk stops; the same address is then processed again.
 *  - A matching shadow block with an empty mask just loses its shadow bit.
 *  - Every block visited, matching or not, that has neither valid nor
 *    held mappings (hblk_vcnt and hblk_hmecnt both zero) is unlinked onto
 *    a private list.
 * The private list is purged once the whole range has been processed.
 */
static void
sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr,
	int hashno)
{
	int hmeshift, shadow = 0;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *nx_hblk, *pr_hblk, *list = NULL;

	ASSERT(hashno > 0);
	hblktag.htag_id = sfmmup;
	hblktag.htag_rehash = hashno;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	hmeshift = HME_HASH_SHIFT(hashno);

	while (addr < endaddr) {
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
		SFMMU_HASH_LOCK(hmebp);
		/* inline HME_HASH_SEARCH */
		hmeblkp = hmebp->hmeblkp;
		pr_hblk = NULL;
		while (hmeblkp) {
			if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) {
				/* found hme_blk */
				ASSERT(!hmeblkp->hblk_shared);
				if (hmeblkp->hblk_shw_bit) {
					if (hmeblkp->hblk_shw_mask) {
						/*
						 * Still shadows something:
						 * clean below it first and
						 * revisit this address.
						 * sfmmu_shadow_hcleanup()
						 * drops and retakes the
						 * bucket lock, so the chain
						 * walk is abandoned here.
						 */
						shadow = 1;
						sfmmu_shadow_hcleanup(sfmmup,
						    hmeblkp, hmebp);
						break;
					} else {
						hmeblkp->hblk_shw_bit = 0;
					}
				}

				/*
				 * Hblk_hmecnt and hblk_vcnt could be non zero
				 * since hblk_unload() does not guarantee that.
				 *
				 * XXX - this could cause tteload() to spin
				 * where sfmmu_shadow_hcleanup() is called.
				 */
			}

			/*
			 * Save the successor before a possible unlink.
			 * pr_hblk only advances past blocks that stay on
			 * the chain, so it remains the true predecessor.
			 */
			nx_hblk = hmeblkp->hblk_next;
			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			} else {
				pr_hblk = hmeblkp;
			}
			hmeblkp = nx_hblk;
		}

		SFMMU_HASH_UNLOCK(hmebp);

		if (shadow) {
			/*
			 * We found another shadow hblk so cleaned its
			 * children.  We need to go back and cleanup
			 * the original hblk so we don't change the
			 * addr.
			 */
			shadow = 0;
		} else {
			/* Advance to the start of the next hmeblk span. */
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    (1 << hmeshift));
		}
	}
	/* Dispose of everything unlinked above, outside any bucket lock. */
	sfmmu_hblks_list_purge(&list, 0);
}
   3678 
   3679 /*
   3680  * This routine's job is to delete stale invalid shared hmeregions hmeblks that
   3681  * may still linger on after pageunload.
   3682  */
   3683 static void
   3684 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz)
   3685 {
   3686 	int hmeshift;
   3687 	hmeblk_tag hblktag;
   3688 	struct hmehash_bucket *hmebp;
   3689 	struct hme_blk *hmeblkp;
   3690 	struct hme_blk *pr_hblk;
   3691 	struct hme_blk *list = NULL;
   3692 
   3693 	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   3694 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   3695 
   3696 	hmeshift = HME_HASH_SHIFT(ttesz);
   3697 	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   3698 	hblktag.htag_rehash = ttesz;
   3699 	hblktag.htag_rid = rid;
   3700 	hblktag.htag_id = srdp;
   3701 	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
   3702 
   3703 	SFMMU_HASH_LOCK(hmebp);
   3704 	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
   3705 	if (hmeblkp != NULL) {
   3706 		ASSERT(hmeblkp->hblk_shared);
   3707 		ASSERT(!hmeblkp->hblk_shw_bit);
   3708 		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
   3709 			panic("sfmmu_cleanup_rhblk: valid hmeblk");
   3710 		}
   3711 		ASSERT(!hmeblkp->hblk_lckcnt);
   3712 		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   3713 		    &list, 0);
   3714 	}
   3715 	SFMMU_HASH_UNLOCK(hmebp);
   3716 	sfmmu_hblks_list_purge(&list, 0);
   3717 }
   3718 
/*
 * Do-nothing region callback.  sfmmu_unload_hmeregion() substitutes it
 * when a region has no rgn_cb_function, so the unload loop can invoke the
 * callback unconditionally.  All arguments are ignored.
 */
/* ARGSUSED */
static void
sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
    size_t r_size, void *r_obj, u_offset_t r_objoff)
{
}
   3725 
   3726 /*
   3727  * Searches for an hmeblk which maps addr, then unloads this mapping
   3728  * and updates *eaddrp, if the hmeblk is found.
   3729  */
   3730 static void
   3731 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr,
   3732     caddr_t eaddr, int ttesz, caddr_t *eaddrp)
   3733 {
   3734 	int hmeshift;
   3735 	hmeblk_tag hblktag;
   3736 	struct hmehash_bucket *hmebp;
   3737 	struct hme_blk *hmeblkp;
   3738 	struct hme_blk *pr_hblk;
   3739 	struct hme_blk *list = NULL;
   3740 
   3741 	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   3742 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   3743 	ASSERT(ttesz >= HBLK_MIN_TTESZ);
   3744 
   3745 	hmeshift = HME_HASH_SHIFT(ttesz);
   3746 	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   3747 	hblktag.htag_rehash = ttesz;
   3748 	hblktag.htag_rid = rid;
   3749 	hblktag.htag_id = srdp;
   3750 	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
   3751 
   3752 	SFMMU_HASH_LOCK(hmebp);
   3753 	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
   3754 	if (hmeblkp != NULL) {
   3755 		ASSERT(hmeblkp->hblk_shared);
   3756 		ASSERT(!hmeblkp->hblk_lckcnt);
   3757 		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
   3758 			*eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr,
   3759 			    eaddr, NULL, HAT_UNLOAD);
   3760 			ASSERT(*eaddrp > addr);
   3761 		}
   3762 		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
   3763 		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   3764 		    &list, 0);
   3765 	}
   3766 	SFMMU_HASH_UNLOCK(hmebp);
   3767 	sfmmu_hblks_list_purge(&list, 0);
   3768 }
   3769 
   3770 static void
   3771 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp)
   3772 {
   3773 	int ttesz = rgnp->rgn_pgszc;
   3774 	size_t rsz = rgnp->rgn_size;
   3775 	caddr_t rsaddr = rgnp->rgn_saddr;
   3776 	caddr_t readdr = rsaddr + rsz;
   3777 	caddr_t rhsaddr;
   3778 	caddr_t va;
   3779 	uint_t rid = rgnp->rgn_id;
   3780 	caddr_t cbsaddr;
   3781 	caddr_t cbeaddr;
   3782 	hat_rgn_cb_func_t rcbfunc;
   3783 	ulong_t cnt;
   3784 
   3785 	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   3786 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   3787 
   3788 	ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz)));
   3789 	ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz)));
   3790 	if (ttesz < HBLK_MIN_TTESZ) {
   3791 		ttesz = HBLK_MIN_TTESZ;
   3792 		rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES);
   3793 	} else {
   3794 		rhsaddr = rsaddr;
   3795 	}
   3796 
   3797 	if ((rcbfunc = rgnp->rgn_cb_function) == NULL) {
   3798 		rcbfunc = sfmmu_rgn_cb_noop;
   3799 	}
   3800 
   3801 	while (ttesz >= HBLK_MIN_TTESZ) {
   3802 		cbsaddr = rsaddr;
   3803 		cbeaddr = rsaddr;
   3804 		if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
   3805 			ttesz--;
   3806 			continue;
   3807 		}
   3808 		cnt = 0;
   3809 		va = rsaddr;
   3810 		while (va < readdr) {
   3811 			ASSERT(va >= rhsaddr);
   3812 			if (va != cbeaddr) {
   3813 				if (cbeaddr != cbsaddr) {
   3814 					ASSERT(cbeaddr > cbsaddr);
   3815 					(*rcbfunc)(cbsaddr, cbeaddr,
   3816 					    rsaddr, rsz, rgnp->rgn_obj,
   3817 					    rgnp->rgn_objoff);
   3818 				}
   3819 				cbsaddr = va;
   3820 				cbeaddr = va;
   3821 			}
   3822 			sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
   3823 			    ttesz, &cbeaddr);
   3824 			cnt++;
   3825 			va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
   3826 		}
   3827 		if (cbeaddr != cbsaddr) {
   3828 			ASSERT(cbeaddr > cbsaddr);
   3829 			(*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
   3830 			    rsz, rgnp->rgn_obj,
   3831 			    rgnp->rgn_objoff);
   3832 		}
   3833 		ttesz--;
   3834 	}
   3835 }
   3836 
   3837 /*
   3838  * Release one hardware address translation lock on the given address range.
   3839  */
   3840 void
   3841 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
   3842 {
   3843 	struct hmehash_bucket *hmebp;
   3844 	hmeblk_tag hblktag;
   3845 	int hmeshift, hashno = 1;
   3846 	struct hme_blk *hmeblkp, *list = NULL;
   3847 	caddr_t endaddr;
   3848 
   3849 	ASSERT(sfmmup != NULL);
   3850 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   3851 
   3852 	ASSERT((sfmmup == ksfmmup) ||
   3853 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   3854 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   3855 	endaddr = addr + len;
   3856 	hblktag.htag_id = sfmmup;
   3857 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   3858 
   3859 	/*
   3860 	 * Spitfire supports 4 page sizes.
   3861 	 * Most pages are expected to be of the smallest page size (8K) and
   3862 	 * these will not need to be rehashed. 64K pages also don't need to be
   3863 	 * rehashed because an hmeblk spans 64K of address space. 512K pages
   3864 	 * might need 1 rehash and and 4M pages might need 2 rehashes.
   3865 	 */
   3866 	while (addr < endaddr) {
   3867 		hmeshift = HME_HASH_SHIFT(hashno);
   3868 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   3869 		hblktag.htag_rehash = hashno;
   3870 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   3871 
   3872 		SFMMU_HASH_LOCK(hmebp);
   3873 
   3874 		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
   3875 		if (hmeblkp != NULL) {
   3876 			ASSERT(!hmeblkp->hblk_shared);
   3877 			/*
   3878 			 * If we encounter a shadow hmeblk then
   3879 			 * we know there are no valid hmeblks mapping
   3880 			 * this address at this size or larger.
   3881 			 * Just increment address by the smallest
   3882 			 * page size.
   3883 			 */
   3884 			if (hmeblkp->hblk_shw_bit) {
   3885 				addr += MMU_PAGESIZE;
   3886 			} else {
   3887 				addr = sfmmu_hblk_unlock(hmeblkp, addr,
   3888 				    endaddr);
   3889 			}
   3890 			SFMMU_HASH_UNLOCK(hmebp);
   3891 			hashno = 1;
   3892 			continue;
   3893 		}
   3894 		SFMMU_HASH_UNLOCK(hmebp);
   3895 
   3896 		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
   3897 			/*
   3898 			 * We have traversed the whole list and rehashed
   3899 			 * if necessary without finding the address to unlock
   3900 			 * which should never happen.
   3901 			 */
   3902 			panic("sfmmu_unlock: addr not found. "
   3903 			    "addr %p hat %p", (void *)addr, (void *)sfmmup);
   3904 		} else {
   3905 			hashno++;
   3906 		}
   3907 	}
   3908 
   3909 	sfmmu_hblks_list_purge(&list, 0);
   3910 }
   3911 
/*
 * Release one translation lock on each mapping in [addr, addr + len) that
 * was loaded through the shared hme region identified by rcookie.
 *
 * With HAT_INVALID_REGION_COOKIE this is simply hat_unlock().  Otherwise
 * the region's hmeblks are looked up under the hat's shared region domain
 * (sfmmu_srdp), trying page sizes from the largest applicable one down to
 * HBLK_MIN_TTESZ.  Not finding an hmeblk for an address is a panic.
 */
void
hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
    hat_region_cookie_t rcookie)
{
	sf_srd_t *srdp;
	sf_region_t *rgnp;
	int ttesz;
	uint_t rid;
	caddr_t eaddr;
	caddr_t va;
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list;

	if (rcookie == HAT_INVALID_REGION_COOKIE) {
		hat_unlock(sfmmup, addr, len);
		return;
	}

	ASSERT(sfmmup != NULL);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	ASSERT(sfmmup != ksfmmup);

	srdp = sfmmup->sfmmu_srdp;
	/* The cookie carries the region id as an integer value. */
	rid = (uint_t)((uint64_t)rcookie);
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	eaddr = addr + len;
	va = addr;
	list = NULL;
	rgnp = srdp->srd_hmergnp[rid];
	SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);

	ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
	ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
	/* Lookups never use a page size below HBLK_MIN_TTESZ. */
	if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
		ttesz = HBLK_MIN_TTESZ;
	} else {
		ttesz = rgnp->rgn_pgszc;
	}
	while (va < eaddr) {
		/*
		 * A previous pass may have stepped ttesz down.  Raise it
		 * again as far as the alignment of va and the region's
		 * page size allow.
		 */
		while (ttesz < rgnp->rgn_pgszc &&
		    IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
			ttesz++;
		}
		/*
		 * Probe from the current size downwards until an hmeblk
		 * covering va is found and unlocked.
		 */
		while (ttesz >= HBLK_MIN_TTESZ) {
			/* Skip sizes the region has not recorded as used. */
			if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
				ttesz--;
				continue;
			}
			hmeshift = HME_HASH_SHIFT(ttesz);
			hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
			hblktag.htag_rehash = ttesz;
			hblktag.htag_rid = rid;
			hblktag.htag_id = srdp;
			hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
			SFMMU_HASH_LOCK(hmebp);
			HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
			    &list);
			if (hmeblkp == NULL) {
				/* Nothing at this size; try the next smaller. */
				SFMMU_HASH_UNLOCK(hmebp);
				ttesz--;
				continue;
			}
			ASSERT(hmeblkp->hblk_shared);
			va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
			ASSERT(va >= eaddr ||
			    IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
			SFMMU_HASH_UNLOCK(hmebp);
			break;
		}
		/* Ran out of page sizes without finding a mapping for va. */
		if (ttesz < HBLK_MIN_TTESZ) {
			panic("hat_unlock_region: addr not found "
			    "addr %p hat %p", (void *)va, (void *)sfmmup);
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
}
   3992 
   3993 /*
   3994  * Function to unlock a range of addresses in an hmeblk.  It returns the
   3995  * next address that needs to be unlocked.
   3996  * Should be called with the hash lock held.
   3997  */
   3998 static caddr_t
   3999 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
   4000 {
   4001 	struct sf_hment *sfhme;
   4002 	tte_t tteold, ttemod;
   4003 	int ttesz, ret;
   4004 
   4005 	ASSERT(in_hblk_range(hmeblkp, addr));
   4006 	ASSERT(hmeblkp->hblk_shw_bit == 0);
   4007 
   4008 	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
   4009 	ttesz = get_hblk_ttesz(hmeblkp);
   4010 
   4011 	HBLKTOHME(sfhme, hmeblkp, addr);
   4012 	while (addr < endaddr) {
   4013 readtte:
   4014 		sfmmu_copytte(&sfhme->hme_tte, &tteold);
   4015 		if (TTE_IS_VALID(&tteold)) {
   4016 
   4017 			ttemod = tteold;
   4018 
   4019 			ret = sfmmu_modifytte_try(&tteold, &ttemod,
   4020 			    &sfhme->hme_tte);
   4021 
   4022 			if (ret < 0)
   4023 				goto readtte;
   4024 
   4025 			if (hmeblkp->hblk_lckcnt == 0)
   4026 				panic("zero hblk lckcnt");
   4027 
   4028 			if (((uintptr_t)addr + TTEBYTES(ttesz)) >
   4029 			    (uintptr_t)endaddr)
   4030 				panic("can't unlock large tte");
   4031 
   4032 			ASSERT(hmeblkp->hblk_lckcnt > 0);
   4033 			atomic_add_32(&hmeblkp->hblk_lckcnt, -1);
   4034 			HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
   4035 		} else {
   4036 			panic("sfmmu_hblk_unlock: invalid tte");
   4037 		}
   4038 		addr += TTEBYTES(ttesz);
   4039 		sfhme++;
   4040 	}
   4041 	return (addr);
   4042 }
   4043 
   4044 /*
   4045  * Physical Address Mapping Framework
   4046  *
   4047  * General rules:
   4048  *
   4049  * (1) Applies only to seg_kmem memory pages. To make things easier,
   4050  *     seg_kpm addresses are also accepted by the routines, but nothing
   4051  *     is done with them since by definition their PA mappings are static.
   4052  * (2) hat_add_callback() may only be called while holding the page lock
   4053  *     SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
   4054  *     or passing HAC_PAGELOCK flag.
   4055  * (3) prehandler() and posthandler() may not call hat_add_callback() or
   4056  *     hat_delete_callback(), nor should they allocate memory. Post quiesce
   4057  *     callbacks may not sleep or acquire adaptive mutex locks.
   4058  * (4) Either prehandler() or posthandler() (but not both) may be specified
   4059  *     as being NULL.  Specifying an errhandler() is optional.
   4060  *
   4061  * Details of using the framework:
   4062  *
   4063  * registering a callback (hat_register_callback())
   4064  *
   4065  *	Pass prehandler, posthandler, errhandler addresses
   4066  *	as described below. If capture_cpus argument is nonzero,
   4067  *	suspend callback to the prehandler will occur with CPUs
   4068  *	captured and executing xc_loop() and CPUs will remain
   4069  *	captured until after the posthandler suspend callback
   4070  *	occurs.
   4071  *
   4072  * adding a callback (hat_add_callback())
   4073  *
   4074  *      as_pagelock();
   4075  *	hat_add_callback();
   4076  *      save returned pfn in private data structures or program registers;
   4077  *      as_pageunlock();
   4078  *
   4079  * prehandler()
   4080  *
   4081  *	Stop all accesses by physical address to this memory page.
   4082  *	Called twice: the first, PRESUSPEND, is a context safe to acquire
   4083  *	adaptive locks. The second, SUSPEND, is called at high PIL with
   4084  *	CPUs captured so adaptive locks may NOT be acquired (and all spin
   4085  *	locks must be XCALL_PIL or higher locks).
   4086  *
   4087  *	May return the following errors:
   4088  *		EIO:	A fatal error has occurred. This will result in panic.
   4089  *		EAGAIN:	The page cannot be suspended. This will fail the
   4090  *			relocation.
   4091  *		0:	Success.
   4092  *
   4093  * posthandler()
   4094  *
   4095  *      Save new pfn in private data structures or program registers;
   4096  *	not allowed to fail (non-zero return values will result in panic).
   4097  *
   4098  * errhandler()
   4099  *
   4100  *	called when an error occurs related to the callback.  Currently
   4101  *	the only such error is HAT_CB_ERR_LEAKED which indicates that
   4102  *	a page is being freed, but there are still outstanding callback(s)
   4103  *	registered on the page.
   4104  *
   4105  * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
   4106  *
   4107  *	stop using physical address
   4108  *	hat_delete_callback();
   4109  *
   4110  */
   4111 
   4112 /*
   4113  * Register a callback class.  Each subsystem should do this once and
   4114  * cache the id_t returned for use in setting up and tearing down callbacks.
   4115  *
   4116  * There is no facility for removing callback IDs once they are created;
   4117  * the "key" should be unique for each module, so in case a module is unloaded
   4118  * and subsequently re-loaded, we can recycle the module's previous entry.
   4119  */
   4120 id_t
   4121 hat_register_callback(int key,
   4122 	int (*prehandler)(caddr_t, uint_t, uint_t, void *),
   4123 	int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
   4124 	int (*errhandler)(caddr_t, uint_t, uint_t, void *),
   4125 	int capture_cpus)
   4126 {
   4127 	id_t id;
   4128 
   4129 	/*
   4130 	 * Search the table for a pre-existing callback associated with
   4131 	 * the identifier "key".  If one exists, we re-use that entry in
   4132 	 * the table for this instance, otherwise we assign the next
   4133 	 * available table slot.
   4134 	 */
   4135 	for (id = 0; id < sfmmu_max_cb_id; id++) {
   4136 		if (sfmmu_cb_table[id].key == key)
   4137 			break;
   4138 	}
   4139 
   4140 	if (id == sfmmu_max_cb_id) {
   4141 		id = sfmmu_cb_nextid++;
   4142 		if (id >= sfmmu_max_cb_id)
   4143 			panic("hat_register_callback: out of callback IDs");
   4144 	}
   4145 
   4146 	ASSERT(prehandler != NULL || posthandler != NULL);
   4147 
   4148 	sfmmu_cb_table[id].key = key;
   4149 	sfmmu_cb_table[id].prehandler = prehandler;
   4150 	sfmmu_cb_table[id].posthandler = posthandler;
   4151 	sfmmu_cb_table[id].errhandler = errhandler;
   4152 	sfmmu_cb_table[id].capture_cpus = capture_cpus;
   4153 
   4154 	return (id);
   4155 }
   4156 
   4157 #define	HAC_COOKIE_NONE	(void *)-1
   4158 
   4159 /*
   4160  * Add relocation callbacks to the specified addr/len which will be called
   4161  * when relocating the associated page. See the description of pre and
   4162  * posthandler above for more details.
   4163  *
   4164  * If HAC_PAGELOCK is included in flags, the underlying memory page is
   4165  * locked internally so the caller must be able to deal with the callback
   4166  * running even before this function has returned.  If HAC_PAGELOCK is not
   4167  * set, it is assumed that the underlying memory pages are locked.
   4168  *
   4169  * Since the caller must track the individual page boundaries anyway,
   4170  * we only allow a callback to be added to a single page (large
   4171  * or small).  Thus [addr, addr + len) MUST be contained within a single
   4172  * page.
   4173  *
   4174  * Registering multiple callbacks on the same [addr, addr+len) is supported,
   4175  * _provided_that_ a unique parameter is specified for each callback.
   4176  * If multiple callbacks are registered on the same range the callback will
   4177  * be invoked with each unique parameter. Registering the same callback with
   4178  * the same argument more than once will result in corrupted kernel state.
   4179  *
   4180  * Returns the pfn of the underlying kernel page in *rpfn
   4181  * on success, or PFN_INVALID on failure.
   4182  *
   4183  * cookiep (if passed) provides storage space for an opaque cookie
   4184  * to return later to hat_delete_callback(). This cookie makes the callback
   4185  * deletion significantly quicker by avoiding a potentially lengthy hash
   4186  * search.
   4187  *
   4188  * Returns values:
   4189  *    0:      success
   4190  *    ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP)
   4191  *    EINVAL: callback ID is not valid
   4192  *    ENXIO:  ["vaddr", "vaddr" + len) is not mapped in the kernel's address
   4193  *            space
   4194  *    ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary
   4195  */
   4196 int
   4197 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
   4198 	void *pvt, pfn_t *rpfn, void **cookiep)
   4199 {
   4200 	struct 		hmehash_bucket *hmebp;
   4201 	hmeblk_tag 	hblktag;
   4202 	struct hme_blk	*hmeblkp;
   4203 	int 		hmeshift, hashno;
   4204 	caddr_t 	saddr, eaddr, baseaddr;
   4205 	struct pa_hment *pahmep;
   4206 	struct sf_hment *sfhmep, *osfhmep;
   4207 	kmutex_t	*pml;
   4208 	tte_t   	tte;
   4209 	page_t		*pp;
   4210 	vnode_t		*vp;
   4211 	u_offset_t	off;
   4212 	pfn_t		pfn;
   4213 	int		kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP;
   4214 	int		locked = 0;
   4215 
   4216 	/*
   4217 	 * For KPM mappings, just return the physical address since we
   4218 	 * don't need to register any callbacks.
   4219 	 */
   4220 	if (IS_KPM_ADDR(vaddr)) {
   4221 		uint64_t paddr;
   4222 		SFMMU_KPM_VTOP(vaddr, paddr);
   4223 		*rpfn = btop(paddr);
   4224 		if (cookiep != NULL)
   4225 			*cookiep = HAC_COOKIE_NONE;
   4226 		return (0);
   4227 	}
   4228 
   4229 	if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
   4230 		*rpfn = PFN_INVALID;
   4231 		return (EINVAL);
   4232 	}
   4233 
   4234 	if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
   4235 		*rpfn = PFN_INVALID;
   4236 		return (ENOMEM);
   4237 	}
   4238 
   4239 	sfhmep = &pahmep->sfment;
   4240 
   4241 	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
   4242 	eaddr = saddr + len;
   4243 
   4244 rehash:
   4245 	/* Find the mapping(s) for this page */
   4246 	for (hashno = TTE64K, hmeblkp = NULL;
   4247 	    hmeblkp == NULL && hashno <= mmu_hashcnt;
   4248 	    hashno++) {
   4249 		hmeshift = HME_HASH_SHIFT(hashno);
   4250 		hblktag.htag_id = ksfmmup;
   4251 		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   4252 		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
   4253 		hblktag.htag_rehash = hashno;
   4254 		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
   4255 
   4256 		SFMMU_HASH_LOCK(hmebp);
   4257 
   4258 		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   4259 
   4260 		if (hmeblkp == NULL)
   4261 			SFMMU_HASH_UNLOCK(hmebp);
   4262 	}
   4263 
   4264 	if (hmeblkp == NULL) {
   4265 		kmem_cache_free(pa_hment_cache, pahmep);
   4266 		*rpfn = PFN_INVALID;
   4267 		return (ENXIO);
   4268 	}
   4269 
   4270 	ASSERT(!hmeblkp->hblk_shared);
   4271 
   4272 	HBLKTOHME(osfhmep, hmeblkp, saddr);
   4273 	sfmmu_copytte(&osfhmep->hme_tte, &tte);
   4274 
   4275 	if (!TTE_IS_VALID(&tte)) {
   4276 		SFMMU_HASH_UNLOCK(hmebp);
   4277 		kmem_cache_free(pa_hment_cache, pahmep);
   4278 		*rpfn = PFN_INVALID;
   4279 		return (ENXIO);
   4280 	}
   4281 
   4282 	/*
   4283 	 * Make sure the boundaries for the callback fall within this
   4284 	 * single mapping.
   4285 	 */
   4286 	baseaddr = (caddr_t)get_hblk_base(hmeblkp);
   4287 	ASSERT(saddr >= baseaddr);
   4288 	if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
   4289 		SFMMU_HASH_UNLOCK(hmebp);
   4290 		kmem_cache_free(pa_hment_cache, pahmep);
   4291 		*rpfn = PFN_INVALID;
   4292 		return (ERANGE);
   4293 	}
   4294 
   4295 	pfn = sfmmu_ttetopfn(&tte, vaddr);
   4296 
   4297 	/*
   4298 	 * The pfn may not have a page_t underneath in which case we
   4299 	 * just return it. This can happen if we are doing I/O to a
   4300 	 * static portion of the kernel's address space, for instance.
   4301 	 */
   4302 	pp = osfhmep->hme_page;
   4303 	if (pp == NULL) {
   4304 		SFMMU_HASH_UNLOCK(hmebp);
   4305 		kmem_cache_free(pa_hment_cache, pahmep);
   4306 		*rpfn = pfn;
   4307 		if (cookiep)
   4308 			*cookiep = HAC_COOKIE_NONE;
   4309 		return (0);
   4310 	}
   4311 	ASSERT(pp == PP_PAGEROOT(pp));
   4312 
   4313 	vp = pp->p_vnode;
   4314 	off = pp->p_offset;
   4315 
   4316 	pml = sfmmu_mlist_enter(pp);
   4317 
   4318 	if (flags & HAC_PAGELOCK) {
   4319 		if (!page_trylock(pp, SE_SHARED)) {
   4320 			/*
   4321 			 * Somebody is holding SE_EXCL lock. Might
   4322 			 * even be hat_page_relocate(). Drop all
   4323 			 * our locks, lookup the page in &kvp, and
   4324 			 * retry. If it doesn't exist in &kvp and &zvp,
   4325 			 * then we must be dealing with a kernel mapped
   4326 			 * page which doesn't actually belong to
   4327 			 * segkmem so we punt.
   4328 			 */
   4329 			sfmmu_mlist_exit(pml);
   4330 			SFMMU_HASH_UNLOCK(hmebp);
   4331 			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
   4332 
   4333 			/* check zvp before giving up */
   4334 			if (pp == NULL)
   4335 				pp = page_lookup(&zvp, (u_offset_t)saddr,
   4336 				    SE_SHARED);
   4337 
   4338 			/* Okay, we didn't find it, give up */
   4339 			if (pp == NULL) {
   4340 				kmem_cache_free(pa_hment_cache, pahmep);
   4341 				*rpfn = pfn;
   4342 				if (cookiep)
   4343 					*cookiep = HAC_COOKIE_NONE;
   4344 				return (0);
   4345 			}
   4346 			page_unlock(pp);
   4347 			goto rehash;
   4348 		}
   4349 		locked = 1;
   4350 	}
   4351 
   4352 	if (!PAGE_LOCKED(pp) && !panicstr)
   4353 		panic("hat_add_callback: page 0x%p not locked", (void *)pp);
   4354 
   4355 	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
   4356 	    pp->p_offset != off) {
   4357 		/*
   4358 		 * The page moved before we got our hands on it.  Drop
   4359 		 * all the locks and try again.
   4360 		 */
   4361 		ASSERT((flags & HAC_PAGELOCK) != 0);
   4362 		sfmmu_mlist_exit(pml);
   4363 		SFMMU_HASH_UNLOCK(hmebp);
   4364 		page_unlock(pp);
   4365 		locked = 0;
   4366 		goto rehash;
   4367 	}
   4368 
   4369 	if (!VN_ISKAS(vp)) {
   4370 		/*
   4371 		 * This is not a segkmem page but another page which
   4372 		 * has been kernel mapped. It had better have at least
   4373 		 * a share lock on it. Return the pfn.
   4374 		 */
   4375 		sfmmu_mlist_exit(pml);
   4376 		SFMMU_HASH_UNLOCK(hmebp);
   4377 		if (locked)
   4378 			page_unlock(pp);
   4379 		kmem_cache_free(pa_hment_cache, pahmep);
   4380 		ASSERT(PAGE_LOCKED(pp));
   4381 		*rpfn = pfn;
   4382 		if (cookiep)
   4383 			*cookiep = HAC_COOKIE_NONE;
   4384 		return (0);
   4385 	}
   4386 
   4387 	/*
   4388 	 * Setup this pa_hment and link its embedded dummy sf_hment into
   4389 	 * the mapping list.
   4390 	 */
   4391 	pp->p_share++;
   4392 	pahmep->cb_id = callback_id;
   4393 	pahmep->addr = vaddr;
   4394 	pahmep->len = len;
   4395 	pahmep->refcnt = 1;
   4396 	pahmep->flags = 0;
   4397 	pahmep->pvt = pvt;
   4398 
   4399 	sfhmep->hme_tte.ll = 0;
   4400 	sfhmep->hme_data = pahmep;
   4401 	sfhmep->hme_prev = osfhmep;
   4402 	sfhmep->hme_next = osfhmep->hme_next;
   4403 
   4404 	if (osfhmep->hme_next)
   4405 		osfhmep->hme_next->hme_prev = sfhmep;
   4406 
   4407 	osfhmep->hme_next = sfhmep;
   4408 
   4409 	sfmmu_mlist_exit(pml);
   4410 	SFMMU_HASH_UNLOCK(hmebp);
   4411 
   4412 	if (locked)
   4413 		page_unlock(pp);
   4414 
   4415 	*rpfn = pfn;
   4416 	if (cookiep)
   4417 		*cookiep = (void *)pahmep;
   4418 
   4419 	return (0);
   4420 }
   4421 
   4422 /*
   4423  * Remove the relocation callbacks from the specified addr/len.
   4424  */
   4425 void
   4426 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
   4427 	void *cookie)
   4428 {
   4429 	struct		hmehash_bucket *hmebp;
   4430 	hmeblk_tag	hblktag;
   4431 	struct hme_blk	*hmeblkp;
   4432 	int		hmeshift, hashno;
   4433 	caddr_t		saddr;
   4434 	struct pa_hment	*pahmep;
   4435 	struct sf_hment	*sfhmep, *osfhmep;
   4436 	kmutex_t	*pml;
   4437 	tte_t		tte;
   4438 	page_t		*pp;
   4439 	vnode_t		*vp;
   4440 	u_offset_t	off;
   4441 	int		locked = 0;
   4442 
   4443 	/*
   4444 	 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
   4445 	 * remove so just return.
   4446 	 */
   4447 	if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
   4448 		return;
   4449 
   4450 	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
   4451 
   4452 rehash:
   4453 	/* Find the mapping(s) for this page */
   4454 	for (hashno = TTE64K, hmeblkp = NULL;
   4455 	    hmeblkp == NULL && hashno <= mmu_hashcnt;
   4456 	    hashno++) {
   4457 		hmeshift = HME_HASH_SHIFT(hashno);
   4458 		hblktag.htag_id = ksfmmup;
   4459 		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   4460 		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
   4461 		hblktag.htag_rehash = hashno;
   4462 		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
   4463 
   4464 		SFMMU_HASH_LOCK(hmebp);
   4465 
   4466 		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   4467 
   4468 		if (hmeblkp == NULL)
   4469 			SFMMU_HASH_UNLOCK(hmebp);
   4470 	}
   4471 
   4472 	if (hmeblkp == NULL)
   4473 		return;
   4474 
   4475 	ASSERT(!hmeblkp->hblk_shared);
   4476 
   4477 	HBLKTOHME(osfhmep, hmeblkp, saddr);
   4478 
   4479 	sfmmu_copytte(&osfhmep->hme_tte, &tte);
   4480 	if (!TTE_IS_VALID(&tte)) {
   4481 		SFMMU_HASH_UNLOCK(hmebp);
   4482 		return;
   4483 	}
   4484 
   4485 	pp = osfhmep->hme_page;
   4486 	if (pp == NULL) {
   4487 		SFMMU_HASH_UNLOCK(hmebp);
   4488 		ASSERT(cookie == NULL);
   4489 		return;
   4490 	}
   4491 
   4492 	vp = pp->p_vnode;
   4493 	off = pp->p_offset;
   4494 
   4495 	pml = sfmmu_mlist_enter(pp);
   4496 
   4497 	if (flags & HAC_PAGELOCK) {
   4498 		if (!page_trylock(pp, SE_SHARED)) {
   4499 			/*
   4500 			 * Somebody is holding SE_EXCL lock. Might
   4501 			 * even be hat_page_relocate(). Drop all
   4502 			 * our locks, lookup the page in &kvp, and
   4503 			 * retry. If it doesn't exist in &kvp and &zvp,
   4504 			 * then we must be dealing with a kernel mapped
   4505 			 * page which doesn't actually belong to
   4506 			 * segkmem so we punt.
   4507 			 */
   4508 			sfmmu_mlist_exit(pml);
   4509 			SFMMU_HASH_UNLOCK(hmebp);
   4510 			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
   4511 			/* check zvp before giving up */
   4512 			if (pp == NULL)
   4513 				pp = page_lookup(&zvp, (u_offset_t)saddr,
   4514 				    SE_SHARED);
   4515 
   4516 			if (pp == NULL) {
   4517 				ASSERT(cookie == NULL);
   4518 				return;
   4519 			}
   4520 			page_unlock(pp);
   4521 			goto rehash;
   4522 		}
   4523 		locked = 1;
   4524 	}
   4525 
   4526 	ASSERT(PAGE_LOCKED(pp));
   4527 
   4528 	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
   4529 	    pp->p_offset != off) {
   4530 		/*
   4531 		 * The page moved before we got our hands on it.  Drop
   4532 		 * all the locks and try again.
   4533 		 */
   4534 		ASSERT((flags & HAC_PAGELOCK) != 0);
   4535 		sfmmu_mlist_exit(pml);
   4536 		SFMMU_HASH_UNLOCK(hmebp);
   4537 		page_unlock(pp);
   4538 		locked = 0;
   4539 		goto rehash;
   4540 	}
   4541 
   4542 	if (!VN_ISKAS(vp)) {
   4543 		/*
   4544 		 * This is not a segkmem page but another page which
   4545 		 * has been kernel mapped.
   4546 		 */
   4547 		sfmmu_mlist_exit(pml);
   4548 		SFMMU_HASH_UNLOCK(hmebp);
   4549 		if (locked)
   4550 			page_unlock(pp);
   4551 		ASSERT(cookie == NULL);
   4552 		return;
   4553 	}
   4554 
   4555 	if (cookie != NULL) {
   4556 		pahmep = (struct pa_hment *)cookie;
   4557 		sfhmep = &pahmep->sfment;
   4558 	} else {
   4559 		for (sfhmep = pp->p_mapping; sfhmep != NULL;
   4560 		    sfhmep = sfhmep->hme_next) {
   4561 
   4562 			/*
   4563 			 * skip va<->pa mappings
   4564 			 */
   4565 			if (!IS_PAHME(sfhmep))
   4566 				continue;
   4567 
   4568 			pahmep = sfhmep->hme_data;
   4569 			ASSERT(pahmep != NULL);
   4570 
   4571 			/*
   4572 			 * if pa_hment matches, remove it
   4573 			 */
   4574 			if ((pahmep->pvt == pvt) &&
   4575 			    (pahmep->addr == vaddr) &&
   4576 			    (pahmep->len == len)) {
   4577 				break;
   4578 			}
   4579 		}
   4580 	}
   4581 
   4582 	if (sfhmep == NULL) {
   4583 		if (!panicstr) {
   4584 			panic("hat_delete_callback: pa_hment not found, pp %p",
   4585 			    (void *)pp);
   4586 		}
   4587 		return;
   4588 	}
   4589 
   4590 	/*
   4591 	 * Note: at this point a valid kernel mapping must still be
   4592 	 * present on this page.
   4593 	 */
   4594 	pp->p_share--;
   4595 	if (pp->p_share <= 0)
   4596 		panic("hat_delete_callback: zero p_share");
   4597 
   4598 	if (--pahmep->refcnt == 0) {
   4599 		if (pahmep->flags != 0)
   4600 			panic("hat_delete_callback: pa_hment is busy");
   4601 
   4602 		/*
   4603 		 * Remove sfhmep from the mapping list for the page.
   4604 		 */
   4605 		if (sfhmep->hme_prev) {
   4606 			sfhmep->hme_prev->hme_next = sfhmep->hme_next;
   4607 		} else {
   4608 			pp->p_mapping = sfhmep->hme_next;
   4609 		}
   4610 
   4611 		if (sfhmep->hme_next)
   4612 			sfhmep->hme_next->hme_prev = sfhmep->hme_prev;
   4613 
   4614 		sfmmu_mlist_exit(pml);
   4615 		SFMMU_HASH_UNLOCK(hmebp);
   4616 
   4617 		if (locked)
   4618 			page_unlock(pp);
   4619 
   4620 		kmem_cache_free(pa_hment_cache, pahmep);
   4621 		return;
   4622 	}
   4623 
   4624 	sfmmu_mlist_exit(pml);
   4625 	SFMMU_HASH_UNLOCK(hmebp);
   4626 	if (locked)
   4627 		page_unlock(pp);
   4628 }
   4629 
   4630 /*
   4631  * hat_probe returns 1 if the translation for the address 'addr' is
   4632  * loaded, zero otherwise.
   4633  *
   4634  * hat_probe should be used only for advisorary purposes because it may
   4635  * occasionally return the wrong value. The implementation must guarantee that
   4636  * returning the wrong value is a very rare event. hat_probe is used
   4637  * to implement optimizations in the segment drivers.
   4638  *
   4639  */
   4640 int
   4641 hat_probe(struct hat *sfmmup, caddr_t addr)
   4642 {
   4643 	pfn_t pfn;
   4644 	tte_t tte;
   4645 
   4646 	ASSERT(sfmmup != NULL);
   4647 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   4648 
   4649 	ASSERT((sfmmup == ksfmmup) ||
   4650 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   4651 
   4652 	if (sfmmup == ksfmmup) {
   4653 		while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
   4654 		    == PFN_SUSPENDED) {
   4655 			sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
   4656 		}
   4657 	} else {
   4658 		pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
   4659 	}
   4660 
   4661 	if (pfn != PFN_INVALID)
   4662 		return (1);
   4663 	else
   4664 		return (0);
   4665 }
   4666 
   4667 ssize_t
   4668 hat_getpagesize(struct hat *sfmmup, caddr_t addr)
   4669 {
   4670 	tte_t tte;
   4671 
   4672 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   4673 
   4674 	if (sfmmup == ksfmmup) {
   4675 		if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4676 			return (-1);
   4677 		}
   4678 	} else {
   4679 		if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4680 			return (-1);
   4681 		}
   4682 	}
   4683 
   4684 	ASSERT(TTE_IS_VALID(&tte));
   4685 	return (TTEBYTES(TTE_CSZ(&tte)));
   4686 }
   4687 
   4688 uint_t
   4689 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr)
   4690 {
   4691 	tte_t tte;
   4692 
   4693 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   4694 
   4695 	if (sfmmup == ksfmmup) {
   4696 		if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4697 			tte.ll = 0;
   4698 		}
   4699 	} else {
   4700 		if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4701 			tte.ll = 0;
   4702 		}
   4703 	}
   4704 	if (TTE_IS_VALID(&tte)) {
   4705 		*attr = sfmmu_ptov_attr(&tte);
   4706 		return (0);
   4707 	}
   4708 	*attr = 0;
   4709 	return ((uint_t)0xffffffff);
   4710 }
   4711 
   4712 /*
   4713  * Enables more attributes on specified address range (ie. logical OR)
   4714  */
   4715 void
   4716 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
   4717 {
   4718 	if (hat->sfmmu_xhat_provider) {
   4719 		XHAT_SETATTR(hat, addr, len, attr);
   4720 		return;
   4721 	} else {
   4722 		/*
   4723 		 * This must be a CPU HAT. If the address space has
   4724 		 * XHATs attached, change attributes for all of them,
   4725 		 * just in case
   4726 		 */
   4727 		ASSERT(hat->sfmmu_as != NULL);
   4728 		if (hat->sfmmu_as->a_xhat != NULL)
   4729 			xhat_setattr_all(hat->sfmmu_as, addr, len, attr);
   4730 	}
   4731 
   4732 	sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR);
   4733 }
   4734 
   4735 /*
   4736  * Assigns attributes to the specified address range.  All the attributes
   4737  * are specified.
   4738  */
   4739 void
   4740 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
   4741 {
   4742 	if (hat->sfmmu_xhat_provider) {
   4743 		XHAT_CHGATTR(hat, addr, len, attr);
   4744 		return;
   4745 	} else {
   4746 		/*
   4747 		 * This must be a CPU HAT. If the address space has
   4748 		 * XHATs attached, change attributes for all of them,
   4749 		 * just in case
   4750 		 */
   4751 		ASSERT(hat->sfmmu_as != NULL);
   4752 		if (hat->sfmmu_as->a_xhat != NULL)
   4753 			xhat_chgattr_all(hat->sfmmu_as, addr, len, attr);
   4754 	}
   4755 
   4756 	sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR);
   4757 }
   4758 
   4759 /*
   4760  * Remove attributes on the specified address range (ie. loginal NAND)
   4761  */
   4762 void
   4763 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
   4764 {
   4765 	if (hat->sfmmu_xhat_provider) {
   4766 		XHAT_CLRATTR(hat, addr, len, attr);
   4767 		return;
   4768 	} else {
   4769 		/*
   4770 		 * This must be a CPU HAT. If the address space has
   4771 		 * XHATs attached, change attributes for all of them,
   4772 		 * just in case
   4773 		 */
   4774 		ASSERT(hat->sfmmu_as != NULL);
   4775 		if (hat->sfmmu_as->a_xhat != NULL)
   4776 			xhat_clrattr_all(hat->sfmmu_as, addr, len, attr);
   4777 	}
   4778 
   4779 	sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR);
   4780 }
   4781 
/*
 * Change attributes on an address range to that specified by attr and mode.
 *
 * Common worker for hat_setattr(), hat_chgattr() and hat_clrattr().  It
 * walks [addr, addr + len), looks up the hmeblk covering the current
 * address in the hme hash (trying rehash levels 1 .. mmu_hashcnt as
 * needed) and passes every non-shadow hmeblk it finds to
 * sfmmu_hblk_chgattr().
 *
 *	sfmmup	hat whose mappings are to be changed.
 *	addr	start of the range; must be MMU page aligned (asserted).
 *	len	length of the range; must be a multiple of the MMU page
 *		size (asserted).
 *	attr	attribute bits, interpreted according to 'mode'.
 *	mode	SFMMU_SETATTR, SFMMU_CHGATTR or SFMMU_CLRATTR.
 *
 * Unless sfmmup is ksfmmup the caller must hold the address space lock
 * (asserted).  Panics if PROT_USER is being set or assigned on a range
 * that extends above USERLIMIT.
 */
static void
sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
	int mode)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;
	cpuset_t cpuset;
	demap_range_t dmr;

	CPUSET_ZERO(cpuset);

	ASSERT((sfmmup == ksfmmup) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

	/* Clearing PROT_USER is always allowed; granting it is not. */
	if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
	    ((addr + len) > (caddr_t)USERLIMIT)) {
		panic("user addr %p in kernel space",
		    (void *)addr);
	}

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	DEMAP_RANGE_INIT(sfmmup, &dmr);

	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				ASSERT(hashno > 1);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				/*
				 * Real hmeblk: change the ttes it holds; the
				 * return value is the next address to handle.
				 */
				addr = sfmmu_hblk_chgattr(sfmmup,
				    hmeblkp, addr, endaddr, &dmr, attr, mode);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			/* Start again at the smallest hash level. */
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to chgattr.
			 * This is ok, so we increment the address by the
			 * smallest hmeblk range for kernel mappings or for
			 * user mappings with no large pages, and the largest
			 * hmeblk range, to account for shadow hmeblks, for
			 * user mappings with large pages and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
			hashno = 1;
		} else {
			/* Nothing at this level; try the next hash level. */
			hashno++;
		}
	}

	/*
	 * NOTE(review): 'list' is only ever handed to HME_HASH_SEARCH above;
	 * presumably it collects hmeblks that the search unlinked, which are
	 * disposed of here -- confirm against the macro.
	 */
	sfmmu_hblks_list_purge(&list, 0);
	/* Flush whatever the demap range accumulated during the walk. */
	DEMAP_RANGE_FLUSH(&dmr);
	cpuset = sfmmup->sfmmu_cpusran;
	xt_sync(cpuset);
}
   4872 
/*
 * This function chgattr on a range of addresses in an hmeblk.  It returns the
 * next address that needs to be chgattr.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgattr by not flushing every time but
 * on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work)
 *
 *	sfmmup	hat that owns the hmeblk.
 *	hmeblkp	hmeblk to operate on; must contain 'addr' and must be
 *		neither a shadow nor a shared hmeblk (asserted).
 *	addr	first address to change.
 *	endaddr	end of the requested range; clamped to the end of the
 *		hmeblk.
 *	dmrp	demap range used to batch TLB demaps.
 *	attr, mode
 *		converted by sfmmu_vtop_attr() into the tte bit mask and the
 *		new bit values.
 *
 * Invalid ttes, and valid ttes whose masked bits already equal the new
 * value, are skipped.
 */
static caddr_t
sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
	caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
{
	tte_t tte, tteattr, tteflags, ttemod;
	struct sf_hment *sfhmep;
	int ttesz;
	struct page *pp = NULL;
	kmutex_t *pml, *pmtx;
	int ret;
	int use_demap_range;
#if defined(SF_ERRATA_57)
	int check_exec;
#endif

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	/*
	 * Flush the current demap region if addresses have been
	 * skipped or the page size doesn't match.
	 */
	use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else {
		DEMAP_RANGE_FLUSH(dmrp);
	}

	/* tteattr holds the new bit values, tteflags the bits to replace. */
	tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
#if defined(SF_ERRATA_57)
	check_exec = (sfmmup != ksfmmup) &&
	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
	    TTE_IS_EXECUTABLE(&tteattr);
#endif
	HBLKTOHME(sfhmep, hmeblkp, addr);
	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			if ((tte.ll & tteflags.ll) == tteattr.ll) {
				/*
				 * if the new attr is the same as old
				 * continue
				 */
				goto next_addr;
			}
			if (!TTE_IS_WRITABLE(&tteattr)) {
				/*
				 * make sure we clear hw modify bit if we
				 * removing write protections
				 *
				 * Note that tteflags is not reset per
				 * iteration, so once set this bit also takes
				 * part in the "same as old" comparison above
				 * for all later entries.
				 */
				tteflags.tte_intlo |= TTE_HWWR_INT;
			}

			pml = NULL;
			pp = sfhmep->hme_page;
			if (pp) {
				pml = sfmmu_mlist_enter(pp);
			}

			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded.
				 *
				 * The "continue" does not advance addr or
				 * sfhmep, so this same entry is re-read and
				 * re-evaluated.
				 */
				ASSERT(pml);
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			/* Build the new tte: replace only the masked bits. */
			ttemod = tte;
			ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
			ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));

#if defined(SF_ERRATA_57)
			if (check_exec && addr < errata57_limit)
				ttemod.tte_exec_perm = 0;
#endif
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

			if (ret < 0) {
				/*
				 * tte changed underneath us; retry this
				 * same entry
				 */
				if (pml) {
					sfmmu_mlist_exit(pml);
				}
				continue;
			}

			if (tteflags.tte_intlo & TTE_HWWR_INT) {
				/*
				 * need to sync if we are clearing modify bit.
				 */
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			/*
			 * Granting write permission on a page currently
			 * marked read-only clears that mark.
			 */
			if (pp && PP_ISRO(pp)) {
				if (tteattr.tte_intlo & TTE_WRPRM_INT) {
					pmtx = sfmmu_page_enter(pp);
					PP_CLRRO(pp);
					sfmmu_page_exit(pmtx);
				}
			}

			/*
			 * A positive return leads to a TLB demap: batched
			 * through the demap range when the page size allows
			 * it, issued immediately otherwise.
			 */
			if (ret > 0 && use_demap_range) {
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else if (ret > 0) {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
			}

			if (pml) {
				sfmmu_mlist_exit(pml);
			}
		}
next_addr:
		addr += TTEBYTES(ttesz);
		sfhmep++;
		DEMAP_RANGE_NEXTPG(dmrp);
	}
	return (addr);
}
   5008 
   5009 /*
   5010  * This routine converts virtual attributes to physical ones.  It will
   5011  * update the tteflags field with the tte mask corresponding to the attributes
   5012  * affected and it returns the new attributes.  It will also clear the modify
   5013  * bit if we are taking away write permission.  This is necessary since the
   5014  * modify bit is the hardware permission bit and we need to clear it in order
   5015  * to detect write faults.
   5016  */
   5017 static uint64_t
   5018 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
   5019 {
   5020 	tte_t ttevalue;
   5021 
   5022 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   5023 
   5024 	switch (mode) {
   5025 	case SFMMU_CHGATTR:
   5026 		/* all attributes specified */
   5027 		ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
   5028 		ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
   5029 		ttemaskp->tte_inthi = TTEINTHI_ATTR;
   5030 		ttemaskp->tte_intlo = TTEINTLO_ATTR;
   5031 		break;
   5032 	case SFMMU_SETATTR:
   5033 		ASSERT(!(attr & ~HAT_PROT_MASK));
   5034 		ttemaskp->ll = 0;
   5035 		ttevalue.ll = 0;
   5036 		/*
   5037 		 * a valid tte implies exec and read for sfmmu
   5038 		 * so no need to do anything about them.
   5039 		 * since priviledged access implies user access
   5040 		 * PROT_USER doesn't make sense either.
   5041 		 */
   5042 		if (attr & PROT_WRITE) {
   5043 			ttemaskp->tte_intlo |= TTE_WRPRM_INT;
   5044 			ttevalue.tte_intlo |= TTE_WRPRM_INT;
   5045 		}
   5046 		break;
   5047 	case SFMMU_CLRATTR:
   5048 		/* attributes will be nand with current ones */
   5049 		if (attr & ~(PROT_WRITE | PROT_USER)) {
   5050 			panic("sfmmu: attr %x not supported", attr);
   5051 		}
   5052 		ttemaskp->ll = 0;
   5053 		ttevalue.ll = 0;
   5054 		if (attr & PROT_WRITE) {
   5055 			/* clear both writable and modify bit */
   5056 			ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
   5057 		}
   5058 		if (attr & PROT_USER) {
   5059 			ttemaskp->tte_intlo |= TTE_PRIV_INT;
   5060 			ttevalue.tte_intlo |= TTE_PRIV_INT;
   5061 		}
   5062 		break;
   5063 	default:
   5064 		panic("sfmmu_vtop_attr: bad mode %x", mode);
   5065 	}
   5066 	ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
   5067 	return (ttevalue.ll);
   5068 }
   5069 
   5070 static uint_t
   5071 sfmmu_ptov_attr(tte_t *ttep)
   5072 {
   5073 	uint_t attr;
   5074 
   5075 	ASSERT(TTE_IS_VALID(ttep));
   5076 
   5077 	attr = PROT_READ;
   5078 
   5079 	if (TTE_IS_WRITABLE(ttep)) {
   5080 		attr |= PROT_WRITE;
   5081 	}
   5082 	if (TTE_IS_EXECUTABLE(ttep)) {
   5083 		attr |= PROT_EXEC;
   5084 	}
   5085 	if (!TTE_IS_PRIVILEGED(ttep)) {
   5086 		attr |= PROT_USER;
   5087 	}
   5088 	if (TTE_IS_NFO(ttep)) {
   5089 		attr |= HAT_NOFAULT;
   5090 	}
   5091 	if (TTE_IS_NOSYNC(ttep)) {
   5092 		attr |= HAT_NOSYNC;
   5093 	}
   5094 	if (TTE_IS_SIDEFFECT(ttep)) {
   5095 		attr |= SFMMU_SIDEFFECT;
   5096 	}
   5097 	if (!TTE_IS_VCACHEABLE(ttep)) {
   5098 		attr |= SFMMU_UNCACHEVTTE;
   5099 	}
   5100 	if (!TTE_IS_PCACHEABLE(ttep)) {
   5101 		attr |= SFMMU_UNCACHEPTTE;
   5102 	}
   5103 	return (attr);
   5104 }
   5105 
/*
 * hat_chgprot is a deprecated hat call.  New segment drivers
 * should store all attributes and use hat_*attr calls.
 *
 * Change the protections in the virtual address range
 * given to the specified virtual protection.  If vprot is ~PROT_WRITE,
 * then remove write permission, leaving the other
 * permissions unchanged.  If vprot is ~PROT_USER, remove user permissions.
 *
 *	sfmmup	hat whose mappings are to be changed.
 *	addr	start of the range; must be MMU page aligned (asserted).
 *	len	length of the range; must be a multiple of the MMU page
 *		size (asserted).
 *	vprot	new protection, or one of the special values above.
 *
 * A hat owned by an XHAT provider is handed to XHAT_CHGPROT() and nothing
 * else is done.  Panics if vprot includes PROT_USER (and is not
 * ~PROT_WRITE) while the range extends above USERLIMIT.
 */
void
hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;
	cpuset_t cpuset;
	demap_range_t dmr;

	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);

	if (sfmmup->sfmmu_xhat_provider) {
		XHAT_CHGPROT(sfmmup, addr, len, vprot);
		return;
	} else {
		/*
		 * This must be a CPU HAT. If the address space has
		 * XHATs attached, change attributes for all of them,
		 * just in case
		 */
		ASSERT(sfmmup->sfmmu_as != NULL);
		if (sfmmup->sfmmu_as->a_xhat != NULL)
			xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot);
	}

	CPUSET_ZERO(cpuset);

	/*
	 * ~PROT_WRITE has the PROT_USER bit set but only removes write
	 * permission, so it is excluded from the user-address check.
	 */
	if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
	    ((addr + len) > (caddr_t)USERLIMIT)) {
		panic("user addr %p vprot %x in kernel space",
		    (void *)addr, vprot);
	}
	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	DEMAP_RANGE_INIT(sfmmup, &dmr);

	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				ASSERT(hashno > 1);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				/*
				 * Real hmeblk: change the ttes it holds; the
				 * return value is the next address to handle.
				 */
				addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
				    addr, endaddr, &dmr, vprot);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			/* Start again at the smallest hash level. */
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to chgprot.
			 * This is ok so we increment the address by the
			 * smallest hmeblk range for kernel mappings and the
			 * largest hmeblk range, to account for shadow hmeblks,
			 * for user mappings and continue.
			 */
			if (sfmmup == ksfmmup)
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(1));
			else
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno));
			hashno = 1;
		} else {
			/* Nothing at this level; try the next hash level. */
			hashno++;
		}
	}

	/*
	 * NOTE(review): 'list' is only ever handed to HME_HASH_SEARCH above;
	 * presumably it collects hmeblks that the search unlinked, which are
	 * disposed of here -- confirm against the macro.
	 */
	sfmmu_hblks_list_purge(&list, 0);
	/* Flush whatever the demap range accumulated during the walk. */
	DEMAP_RANGE_FLUSH(&dmr);
	cpuset = sfmmup->sfmmu_cpusran;
	xt_sync(cpuset);
}
   5212 
/*
 * This function chgprots a range of addresses in an hmeblk.  It returns the
 * next address that needs to be chgprot.
 * It should be called with the hash lock held.
 * XXX It should be possible to optimize chgprot by not flushing every time but
 * on the other hand:
 * 1. do one flush crosscall.
 * 2. only flush if we are increasing permissions (make sure this will work)
 *
 *	sfmmup	hat that owns the hmeblk.
 *	hmeblkp	hmeblk to operate on; must contain 'addr' and must be
 *		neither a shadow nor a shared hmeblk (asserted).
 *	addr	first address to change.
 *	endaddr	end of the requested range; clamped to the end of the
 *		hmeblk.
 *	dmrp	demap range used to batch TLB demaps.
 *	vprot	virtual protection, converted by sfmmu_vtop_prot() into the
 *		tte flag mask and the new flag values.
 *
 * Invalid ttes, and valid ttes whose masked flags already equal the new
 * value, are skipped.  On DEBUG kernels a request that ends inside an
 * hmeblk whose tte size is not TTE8K panics.
 */
static caddr_t
sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
	caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
{
	uint_t pprot;
	tte_t tte, ttemod;
	struct sf_hment *sfhmep;
	uint_t tteflags;
	int ttesz;
	struct page *pp = NULL;
	kmutex_t *pml, *pmtx;
	int ret;
	int use_demap_range;
#if defined(SF_ERRATA_57)
	int check_exec;
#endif

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

#ifdef DEBUG
	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
	    (endaddr < get_hblk_endaddr(hmeblkp))) {
		panic("sfmmu_hblk_chgprot: partial chgprot of large page");
	}
#endif /* DEBUG */

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	/* pprot holds the new flag values, tteflags the flags to replace. */
	pprot = sfmmu_vtop_prot(vprot, &tteflags);
#if defined(SF_ERRATA_57)
	check_exec = (sfmmup != ksfmmup) &&
	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
	    ((vprot & PROT_EXEC) == PROT_EXEC);
#endif
	HBLKTOHME(sfhmep, hmeblkp, addr);

	/*
	 * Flush the current demap region if addresses have been
	 * skipped or the page size doesn't match.
	 */
	use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
	if (use_demap_range) {
		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
	} else {
		DEMAP_RANGE_FLUSH(dmrp);
	}

	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
				/*
				 * if the new protection is the same as old
				 * continue
				 */
				goto next_addr;
			}
			pml = NULL;
			pp = sfhmep->hme_page;
			if (pp) {
				pml = sfmmu_mlist_enter(pp);
			}
			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded
				 * underneath us.  Recheck
				 *
				 * The "continue" does not advance addr or
				 * sfhmep, so this same entry is re-read and
				 * re-evaluated.
				 */
				ASSERT(pml);
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			/* Build the new tte: replace only the masked flags. */
			ttemod = tte;
			TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
#if defined(SF_ERRATA_57)
			if (check_exec && addr < errata57_limit)
				ttemod.tte_exec_perm = 0;
#endif
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhmep->hme_tte);

			if (ret < 0) {
				/*
				 * tte changed underneath us; retry this
				 * same entry
				 */
				if (pml) {
					sfmmu_mlist_exit(pml);
				}
				continue;
			}

			if (tteflags & TTE_HWWR_INT) {
				/*
				 * need to sync if we are clearing modify bit.
				 */
				sfmmu_ttesync(sfmmup, addr, &tte, pp);
			}

			/*
			 * Granting write permission on a page currently
			 * marked read-only clears that mark.
			 */
			if (pp && PP_ISRO(pp)) {
				if (pprot & TTE_WRPRM_INT) {
					pmtx = sfmmu_page_enter(pp);
					PP_CLRRO(pp);
					sfmmu_page_exit(pmtx);
				}
			}

			/*
			 * A positive return leads to a TLB demap: batched
			 * through the demap range when the page size allows
			 * it, issued immediately otherwise.
			 */
			if (ret > 0 && use_demap_range) {
				DEMAP_RANGE_MARKPG(dmrp, addr);
			} else if (ret > 0) {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
			}

			if (pml) {
				sfmmu_mlist_exit(pml);
			}
		}
next_addr:
		addr += TTEBYTES(ttesz);
		sfhmep++;
		DEMAP_RANGE_NEXTPG(dmrp);
	}
	return (addr);
}
   5348 
   5349 /*
   5350  * This routine is deprecated and should only be used by hat_chgprot.
   5351  * The correct routine is sfmmu_vtop_attr.
   5352  * This routine converts virtual page protections to physical ones.  It will
   5353  * update the tteflags field with the tte mask corresponding to the protections
   5354  * affected and it returns the new protections.  It will also clear the modify
   5355  * bit if we are taking away write permission.  This is necessary since the
   5356  * modify bit is the hardware permission bit and we need to clear it in order
   5357  * to detect write faults.
   5358  * It accepts the following special protections:
   5359  * ~PROT_WRITE = remove write permissions.
   5360  * ~PROT_USER = remove user permissions.
         *
         * Arguments:
         *	vprot		one of the two special values above, or a non-empty
         *			combination of PROT_READ/PROT_WRITE/PROT_EXEC,
         *			optionally or'ed with PROT_USER.
         *	tteflagsp	out: mask of the tte flag bits the caller is
         *			expected to update.
         *
         * Returns the values to be stored into the bits selected by *tteflagsp.
         * Panics when vprot is 0, is PROT_USER alone, or has bits outside PROT_ALL.
   5361  */
   5362 static uint_t
   5363 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
   5364 {
        	/* The two "remove one permission" requests touch only their own bits. */
   5365 	if (vprot == (uint_t)~PROT_WRITE) {
   5366 		*tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
   5367 		return (0);		/* will cause wrprm to be cleared */
   5368 	}
   5369 	if (vprot == (uint_t)~PROT_USER) {
   5370 		*tteflagsp = TTE_PRIV_INT;
   5371 		return (0);		/* will cause privprm to be cleared */
   5372 	}
        	/* Reject empty, user-only and out-of-range protection requests. */
   5373 	if ((vprot == 0) || (vprot == PROT_USER) ||
   5374 	    ((vprot & PROT_ALL) != vprot)) {
   5375 		panic("sfmmu_vtop_prot -- bad prot %x", vprot);
   5376 	}
   5377 
        	/*
        	 * In the non-writable cases TTE_HWWR_INT is part of the mask but
        	 * never part of the returned value, so the modify bit is cleared
        	 * together with the write permission.  The writable cases leave
        	 * TTE_HWWR_INT out of the mask and therefore untouched.
        	 */
   5378 	switch (vprot) {
   5379 	case (PROT_READ):
   5380 	case (PROT_EXEC):
   5381 	case (PROT_EXEC | PROT_READ):
   5382 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
   5383 		return (TTE_PRIV_INT); 		/* set prv and clr wrt */
   5384 	case (PROT_WRITE):
   5385 	case (PROT_WRITE | PROT_READ):
   5386 	case (PROT_EXEC | PROT_WRITE):
   5387 	case (PROT_EXEC | PROT_WRITE | PROT_READ):
   5388 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
   5389 		return (TTE_PRIV_INT | TTE_WRPRM_INT); 	/* set prv and wrt */
   5390 	case (PROT_USER | PROT_READ):
   5391 	case (PROT_USER | PROT_EXEC):
   5392 	case (PROT_USER | PROT_EXEC | PROT_READ):
   5393 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
   5394 		return (0); 			/* clr prv and wrt */
   5395 	case (PROT_USER | PROT_WRITE):
   5396 	case (PROT_USER | PROT_WRITE | PROT_READ):
   5397 	case (PROT_USER | PROT_EXEC | PROT_WRITE):
   5398 	case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
   5399 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
   5400 		return (TTE_WRPRM_INT); 	/* clr prv and set wrt */
   5401 	default:
   5402 		panic("sfmmu_vtop_prot -- bad prot %x", vprot);
   5403 	}
        	/* Only reached if panic() were to return. */
   5404 	return (0);
   5405 }
   5406 
   5407 /*
   5408  * Alternate unload for very large virtual ranges. With a true 64 bit VA,
   5409  * the normal algorithm would take too long for a very large VA range with
   5410  * few real mappings. This routine just walks thru all HMEs in the global
   5411  * hash table to find and remove mappings.
         *
         * Arguments:
         *	sfmmup		hat whose mappings are to be removed
         *	startaddr, len	the range [startaddr, startaddr + len) to unload
         *	flags		handed to sfmmu_hblk_unload(); HAT_UNLOAD_UNMAP
         *			additionally takes emptied hmeblks off the hash
         *	callback	if not NULL, hcb_function is invoked once for each
         *			hmeblk of this hat that overlaps the range, with
         *			hcb_start_addr/hcb_end_addr filled in.  The calls
         *			are batched MAX_CB_ADDR at a time and are made only
         *			after any pending demaps have been flushed.
   5412  */
   5413 static void
   5414 hat_unload_large_virtual(
   5415 	struct hat		*sfmmup,
   5416 	caddr_t			startaddr,
   5417 	size_t			len,
   5418 	uint_t			flags,
   5419 	hat_callback_t		*callback)
   5420 {
   5421 	struct hmehash_bucket *hmebp;
   5422 	struct hme_blk *hmeblkp;
   5423 	struct hme_blk *pr_hblk = NULL;
   5424 	struct hme_blk *nx_hblk;
   5425 	struct hme_blk *list = NULL;
   5426 	int i;
   5427 	demap_range_t dmr, *dmrp;
   5428 	cpuset_t cpuset;
   5429 	caddr_t	endaddr = startaddr + len;
   5430 	caddr_t	sa;
   5431 	caddr_t	ea;
   5432 	caddr_t	cb_sa[MAX_CB_ADDR];
   5433 	caddr_t	cb_ea[MAX_CB_ADDR];
   5434 	int	addr_cnt = 0;
   5435 	int	a = 0;
   5436 
        	/*
        	 * No demap range is tracked when sfmmu_free is set (the process is
        	 * exiting, see the page size check at the end of this function).
        	 */
   5437 	if (sfmmup->sfmmu_free) {
   5438 		dmrp = NULL;
   5439 	} else {
   5440 		dmrp = &dmr;
   5441 		DEMAP_RANGE_INIT(sfmmup, dmrp);
   5442 	}
   5443 
   5444 	/*
   5445 	 * Loop through all the hash buckets of HME blocks looking for matches.
        	 *
        	 * NOTE(review): the bound is inclusive, which assumes UHMEHASH_SZ is
        	 * the highest valid index of uhme_hash[] rather than the number of
        	 * buckets -- confirm against its definition.
   5446 	 */
   5447 	for (i = 0; i <= UHMEHASH_SZ; i++) {
   5448 		hmebp = &uhme_hash[i];
   5449 		SFMMU_HASH_LOCK(hmebp);
   5450 		hmeblkp = hmebp->hmeblkp;
        		/* pr_hblk trails hmeblkp: last block left on this bucket's list. */
   5451 		pr_hblk = NULL;
   5452 		while (hmeblkp) {
        			/* Fetch the successor now; hmeblkp may be unlinked below. */
   5453 			nx_hblk = hmeblkp->hblk_next;
   5454 
   5455 			/*
   5456 			 * skip if not this context, if a shadow block or
   5457 			 * if the mapping is not in the requested range
        			 * (sa and ea are set as a side effect of the test)
   5458 			 */
   5459 			if (hmeblkp->hblk_tag.htag_id != sfmmup ||
   5460 			    hmeblkp->hblk_shw_bit ||
   5461 			    (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
   5462 			    (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {
   5463 				pr_hblk = hmeblkp;
   5464 				goto next_block;
   5465 			}
   5466 
   5467 			ASSERT(!hmeblkp->hblk_shared);
   5468 			/*
   5469 			 * unload if there are any current valid mappings
   5470 			 */
   5471 			if (hmeblkp->hblk_vcnt != 0 ||
   5472 			    hmeblkp->hblk_hmecnt != 0)
   5473 				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
   5474 				    sa, ea, dmrp, flags);
   5475 
   5476 			/*
   5477 			 * on unmap we also release the HME block itself, once
   5478 			 * all mappings are gone.
        			 * A removed block does not become pr_hblk.
   5479 			 */
   5480 			if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
   5481 			    !hmeblkp->hblk_vcnt &&
   5482 			    !hmeblkp->hblk_hmecnt) {
   5483 				ASSERT(!hmeblkp->hblk_lckcnt);
   5484 				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   5485 				    &list, 0);
   5486 			} else {
   5487 				pr_hblk = hmeblkp;
   5488 			}
   5489 
   5490 			if (callback == NULL)
   5491 				goto next_block;
   5492 
   5493 			/*
   5494 			 * HME blocks may span more than one page, but we may be
   5495 			 * unmapping only one page, so check for a smaller range
   5496 			 * for the callback
        			 * (ea is turned into the last byte of the range here,
        			 * so the recorded end address is inclusive)
   5497 			 */
   5498 			if (sa < startaddr)
   5499 				sa = startaddr;
   5500 			if (--ea > endaddr)
   5501 				ea = endaddr - 1;
   5502 
   5503 			cb_sa[addr_cnt] = sa;
   5504 			cb_ea[addr_cnt] = ea;
        			/*
        			 * Batch is full: flush the pending demaps first, then
        			 * report every recorded range and start a new batch.
        			 */
   5505 			if (++addr_cnt == MAX_CB_ADDR) {
   5506 				if (dmrp != NULL) {
   5507 					DEMAP_RANGE_FLUSH(dmrp);
   5508 					cpuset = sfmmup->sfmmu_cpusran;
   5509 					xt_sync(cpuset);
   5510 				}
   5511 
   5512 				for (a = 0; a < MAX_CB_ADDR; ++a) {
   5513 					callback->hcb_start_addr = cb_sa[a];
   5514 					callback->hcb_end_addr = cb_ea[a];
   5515 					callback->hcb_function(callback);
   5516 				}
   5517 				addr_cnt = 0;
   5518 			}
   5519 
   5520 next_block:
   5521 			hmeblkp = nx_hblk;
   5522 		}
   5523 		SFMMU_HASH_UNLOCK(hmebp);
   5524 	}
   5525 
        	/* Dispose of the hmeblks that sfmmu_hblk_hash_rm() put on list. */
   5526 	sfmmu_hblks_list_purge(&list, 0);
   5527 	if (dmrp != NULL) {
   5528 		DEMAP_RANGE_FLUSH(dmrp);
   5529 		cpuset = sfmmup->sfmmu_cpusran;
   5530 		xt_sync(cpuset);
   5531 	}
   5532 
        	/*
        	 * Report the last, partially filled batch.  addr_cnt is only ever
        	 * incremented past the callback == NULL check above, so callback is
        	 * not NULL whenever this loop body runs.
        	 */
   5533 	for (a = 0; a < addr_cnt; ++a) {
   5534 		callback->hcb_start_addr = cb_sa[a];
   5535 		callback->hcb_end_addr = cb_ea[a];
   5536 		callback->hcb_function(callback);
   5537 	}
   5538 
   5539 	/*
   5540 	 * Check TSB and TLB page sizes if the process isn't exiting.
   5541 	 */
   5542 	if (!sfmmup->sfmmu_free)
   5543 		sfmmu_check_page_sizes(sfmmup, 0);
   5544 }
   5545 
   5546 /*
   5547  * Unload all the mappings in the range [addr..addr+len). addr and len must
   5548  * be MMU_PAGESIZE aligned.
   5549  */
   5550 
   5551 extern struct seg *segkmap;
        /*
         * ISSEGKMAP(sfmmup, addr) is true when sfmmup is the kernel hat (ksfmmup)
         * and addr lies within [segkmap->s_base, segkmap->s_base + segkmap->s_size).
         * Both macro parameters are parenthesized so that expression arguments
         * expand safely; note that addr is evaluated twice.
         */
   5552 #define	ISSEGKMAP(sfmmup, addr) ((sfmmup) == ksfmmup && \
   5553 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))
   5554 
   5555 
        /*
         * Unload the mappings of sfmmup in [addr, addr + len) and, when callback is
         * not NULL, report the unloaded address ranges through
         * callback->hcb_function().
         *
         * Arguments:
         *	sfmmup		hat to unload from.  A hat with an xhat provider is
         *			handed to XHAT_UNLOAD_CALLBACK() and nothing else
         *			is done here.
         *	addr, len	range to unload; both are asserted to be
         *			MMU_PAGESIZE aligned.
         *	flags		handed to sfmmu_hblk_unload().  HAT_UNLOAD_UNMAP
         *			also removes emptied hmeblks from the hash.
         *	callback	optional.  Contiguous ranges are merged, and the
         *			calls are made in batches of up to MAX_CB_ADDR
         *			after the pending demaps have been flushed.
         *
         * For a non-kernel hat whose range would need more 4M probes than there are
         * user hash buckets, the work is delegated to hat_unload_large_virtual().
         */
   5556 void
   5557 hat_unload_callback(
   5558 	struct hat *sfmmup,
   5559 	caddr_t addr,
   5560 	size_t len,
   5561 	uint_t flags,
   5562 	hat_callback_t *callback)
   5563 {
   5564 	struct hmehash_bucket *hmebp;
   5565 	hmeblk_tag hblktag;
   5566 	int hmeshift, hashno, iskernel;
   5567 	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
   5568 	caddr_t endaddr;
   5569 	cpuset_t cpuset;
   5570 	int addr_count = 0;
   5571 	int a;
   5572 	caddr_t cb_start_addr[MAX_CB_ADDR];
   5573 	caddr_t cb_end_addr[MAX_CB_ADDR];
        	/* Evaluated once, on the starting address only. */
   5574 	int issegkmap = ISSEGKMAP(sfmmup, addr);
   5575 	demap_range_t dmr, *dmrp;
   5576 
   5577 	if (sfmmup->sfmmu_xhat_provider) {
   5578 		XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback);
   5579 		return;
   5580 	} else {
   5581 		/*
   5582 		 * This must be a CPU HAT. If the address space has
   5583 		 * XHATs attached, unload the mappings for all of them,
   5584 		 * just in case
   5585 		 */
   5586 		ASSERT(sfmmup->sfmmu_as != NULL);
   5587 		if (sfmmup->sfmmu_as->a_xhat != NULL)
   5588 			xhat_unload_callback_all(sfmmup->sfmmu_as, addr,
   5589 			    len, flags, callback);
   5590 	}
   5591 
   5592 	ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \
   5593 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   5594 
        	/*
        	 * NOTE(review): sfmmup has already been dereferenced several times
        	 * above, so this assertion cannot catch a NULL hat at this point.
        	 */
   5595 	ASSERT(sfmmup != NULL);
   5596 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   5597 	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
   5598 
   5599 	/*
   5600 	 * Probing through a large VA range (say 63 bits) will be slow, even
   5601 	 * at 4 Meg steps between the probes. So, when the virtual address range
   5602 	 * is very large, search the HME entries for what to unload.
   5603 	 *
   5604 	 *	len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
   5605 	 *
   5606 	 *	UHMEHASH_SZ is number of hash buckets to examine
   5607 	 *
   5608 	 */
   5609 	if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
   5610 		hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
   5611 		return;
   5612 	}
   5613 
   5614 	CPUSET_ZERO(cpuset);
   5615 
   5616 	/*
   5617 	 * If the process is exiting, we can save a lot of fuss since
   5618 	 * we'll flush the TLB when we free the ctx anyway.
   5619 	 */
   5620 	if (sfmmup->sfmmu_free)
   5621 		dmrp = NULL;
   5622 	else
   5623 		dmrp = &dmr;
   5624 
        	/* Called with a possibly NULL dmrp, as is DEMAP_RANGE_FLUSH() below. */
   5625 	DEMAP_RANGE_INIT(sfmmup, dmrp);
   5626 	endaddr = addr + len;
   5627 	hblktag.htag_id = sfmmup;
   5628 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   5629 
   5630 	/*
   5631 	 * It is likely for the vm to call unload over a wide range of
   5632 	 * addresses that are actually very sparsely populated by
   5633 	 * translations.  In order to speed this up the sfmmu hat supports
   5634 	 * the concept of shadow hmeblks. Dummy large page hmeblks that
   5635 	 * correspond to actual small translations are allocated at tteload
   5636 	 * time and are referred to as shadow hmeblks.  Now, during unload
   5637 	 * time, we first check if we have a shadow hmeblk for that
   5638 	 * translation.  The absence of one means the corresponding address
   5639 	 * range is empty and can be skipped.
   5640 	 *
   5641 	 * The kernel is an exception to above statement and that is why
   5642 	 * we don't use shadow hmeblks and hash starting from the smallest
   5643 	 * page size.
   5644 	 */
   5645 	if (sfmmup == KHATID) {
   5646 		iskernel = 1;
   5647 		hashno = TTE64K;
   5648 	} else {
   5649 		iskernel = 0;
   5650 		if (mmu_page_sizes == max_mmu_page_sizes) {
   5651 			hashno = TTE256M;
   5652 		} else {
   5653 			hashno = TTE4M;
   5654 		}
   5655 	}
        	/*
        	 * Each pass probes the hash for addr at size hashno.  Every branch
        	 * either advances addr or changes hashno before looping again.
        	 */
   5656 	while (addr < endaddr) {
   5657 		hmeshift = HME_HASH_SHIFT(hashno);
   5658 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   5659 		hblktag.htag_rehash = hashno;
   5660 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   5661 
   5662 		SFMMU_HASH_LOCK(hmebp);
   5663 
   5664 		HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
   5665 		if (hmeblkp == NULL) {
   5666 			/*
   5667 			 * didn't find an hmeblk. skip the appropriate
   5668 			 * address range.
   5669 			 */
   5670 			SFMMU_HASH_UNLOCK(hmebp);
        			/*
        			 * Kernel: retry the same addr at each larger hash
        			 * size up to mmu_hashcnt, then step to the next 64K
        			 * boundary and start over at TTE64K.
        			 */
   5671 			if (iskernel) {
   5672 				if (hashno < mmu_hashcnt) {
   5673 					hashno++;
   5674 					continue;
   5675 				} else {
   5676 					hashno = TTE64K;
   5677 					addr = (caddr_t)roundup((uintptr_t)addr
   5678 					    + 1, MMU_PAGESIZE64K);
   5679 					continue;
   5680 				}
   5681 			}
        			/*
        			 * User: skip the whole span covered at this hash
        			 * size, then choose the next probe size from the
        			 * alignment of the new addr (the largest size whose
        			 * boundary addr sits on).
        			 */
   5682 			addr = (caddr_t)roundup((uintptr_t)addr + 1,
   5683 			    (1 << hmeshift));
   5684 			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
   5685 				ASSERT(hashno == TTE64K);
   5686 				continue;
   5687 			}
   5688 			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
   5689 				hashno = TTE512K;
   5690 				continue;
   5691 			}
   5692 			if (mmu_page_sizes == max_mmu_page_sizes) {
   5693 				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
   5694 					hashno = TTE4M;
   5695 					continue;
   5696 				}
   5697 				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
   5698 					hashno = TTE32M;
   5699 					continue;
   5700 				}
   5701 				hashno = TTE256M;
   5702 				continue;
   5703 			} else {
   5704 				hashno = TTE4M;
   5705 				continue;
   5706 			}
   5707 		}
   5708 		ASSERT(hmeblkp);
   5709 		ASSERT(!hmeblkp->hblk_shared);
   5710 		if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
   5711 			/*
   5712 			 * If the valid count is zero we can skip the range
   5713 			 * mapped by this hmeblk.
   5714 			 * We free hblks in the case of HAT_UNMAP.  HAT_UNMAP
   5715 			 * is used by segment drivers as a hint
   5716 			 * that the mapping resource won't be used any longer.
   5717 			 * The best example of this is during exit().
   5718 			 */
   5719 			addr = (caddr_t)roundup((uintptr_t)addr + 1,
   5720 			    get_hblk_span(hmeblkp));
   5721 			if ((flags & HAT_UNLOAD_UNMAP) ||
   5722 			    (iskernel && !issegkmap)) {
   5723 				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   5724 				    &list, 0);
   5725 			}
   5726 			SFMMU_HASH_UNLOCK(hmebp);
   5727 
        			/* Same probe size selection as in the not-found case. */
   5728 			if (iskernel) {
   5729 				hashno = TTE64K;
   5730 				continue;
   5731 			}
   5732 			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
   5733 				ASSERT(hashno == TTE64K);
   5734 				continue;
   5735 			}
   5736 			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
   5737 				hashno = TTE512K;
   5738 				continue;
   5739 			}
   5740 			if (mmu_page_sizes == max_mmu_page_sizes) {
   5741 				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
   5742 					hashno = TTE4M;
   5743 					continue;
   5744 				}
   5745 				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
   5746 					hashno = TTE32M;
   5747 					continue;
   5748 				}
   5749 				hashno = TTE256M;
   5750 				continue;
   5751 			} else {
   5752 				hashno = TTE4M;
   5753 				continue;
   5754 			}
   5755 		}
   5756 		if (hmeblkp->hblk_shw_bit) {
   5757 			/*
   5758 			 * If we encounter a shadow hmeblk we know there are
   5759 			 * smaller sized hmeblks mapping the same address space.
   5760 			 * Decrement the hash size and rehash.
   5761 			 */
   5762 			ASSERT(sfmmup != KHATID);
   5763 			hashno--;
   5764 			SFMMU_HASH_UNLOCK(hmebp);
   5765 			continue;
   5766 		}
   5767 
   5768 		/*
   5769 		 * track callback address ranges.
   5770 		 * only start a new range when it's not contiguous
        		 *
        		 * When addr continues the previous range, addr_count is
        		 * stepped back so that the end-address store after the unload
        		 * below overwrites that entry's end, extending it.
   5771 		 */
   5772 		if (callback != NULL) {
   5773 			if (addr_count > 0 &&
   5774 			    addr == cb_end_addr[addr_count - 1])
   5775 				--addr_count;
   5776 			else
   5777 				cb_start_addr[addr_count] = addr;
   5778 		}
   5779 
   5780 		addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
   5781 		    dmrp, flags);
   5782 
        		/* addr is now the next address to unload: an exclusive end. */
   5783 		if (callback != NULL)
   5784 			cb_end_addr[addr_count++] = addr;
   5785 
   5786 		if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
   5787 		    !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
   5788 			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
   5789 		}
   5790 		SFMMU_HASH_UNLOCK(hmebp);
   5791 
   5792 		/*
   5793 		 * Notify our caller as to exactly which pages
   5794 		 * have been unloaded. We do these in clumps,
   5795 		 * to minimize the number of xt_sync()s that need to occur.
   5796 		 */
   5797 		if (callback != NULL && addr_count == MAX_CB_ADDR) {
   5798 			DEMAP_RANGE_FLUSH(dmrp);
   5799 			if (dmrp != NULL) {
   5800 				cpuset = sfmmup->sfmmu_cpusran;
   5801 				xt_sync(cpuset);
   5802 			}
   5803 
   5804 			for (a = 0; a < MAX_CB_ADDR; ++a) {
   5805 				callback->hcb_start_addr = cb_start_addr[a];
   5806 				callback->hcb_end_addr = cb_end_addr[a];
   5807 				callback->hcb_function(callback);
   5808 			}
   5809 			addr_count = 0;
   5810 		}
        		/* Same probe size selection as above, for the next pass. */
   5811 		if (iskernel) {
   5812 			hashno = TTE64K;
   5813 			continue;
   5814 		}
   5815 		if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
   5816 			ASSERT(hashno == TTE64K);
   5817 			continue;
   5818 		}
   5819 		if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
   5820 			hashno = TTE512K;
   5821 			continue;
   5822 		}
   5823 		if (mmu_page_sizes == max_mmu_page_sizes) {
   5824 			if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
   5825 				hashno = TTE4M;
   5826 				continue;
   5827 			}
   5828 			if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
   5829 				hashno = TTE32M;
   5830 				continue;
   5831 			}
   5832 			hashno = TTE256M;
   5833 		} else {
   5834 			hashno = TTE4M;
   5835 		}
   5836 	}
   5837 
        	/*
        	 * Dispose of the hmeblks collected on list, flush what is still
        	 * pending and report the last, partially filled callback batch.
        	 */
   5838 	sfmmu_hblks_list_purge(&list, 0);
   5839 	DEMAP_RANGE_FLUSH(dmrp);
   5840 	if (dmrp != NULL) {
   5841 		cpuset = sfmmup->sfmmu_cpusran;
   5842 		xt_sync(cpuset);
   5843 	}
   5844 	if (callback && addr_count != 0) {
   5845 		for (a = 0; a < addr_count; ++a) {
   5846 			callback->hcb_start_addr = cb_start_addr[a];
   5847 			callback->hcb_end_addr = cb_end_addr[a];
   5848 			callback->hcb_function(callback);
   5849 		}
   5850 	}
   5851 
   5852 	/*
   5853 	 * Check TSB and TLB page sizes if the process isn't exiting.
   5854 	 */
   5855 	if (!sfmmup->sfmmu_free)
   5856 		sfmmu_check_page_sizes(sfmmup, 0);
   5857 }
   5858 
   5859 /*
   5860  * Unload all the mappings in the range [addr..addr+len). addr and len must
   5861  * be MMU_PAGESIZE aligned.
         *
         * A hat that has an xhat provider is handed to XHAT_UNLOAD(); every other
         * hat goes through hat_unload_callback() with a NULL callback.
   5862  */
   5863 void
   5864 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
   5865 {
   5866 	if (sfmmup->sfmmu_xhat_provider) {
   5867 		XHAT_UNLOAD(sfmmup, addr, len, flags);
   5868 		return;
   5869 	}
   5870 	hat_unload_callback(sfmmup, addr, len, flags, NULL);
   5871 }
   5872 
   5873 
   5874 /*
   5875  * Find the largest mapping size for this page.
         *
         * Returns the position of the highest bit set in PP_MAPINDEX(pp), counting
         * the 8K bit as position 0; the result is 0 when no higher bit is set.
         * Presumably the result is the tte size code of the largest mapping --
         * confirm against the definition of PP_MAPINDEX.
   5876  */
   5877 int
   5878 fnd_mapping_sz(page_t *pp)
   5879 {
   5880 	int sz;
   5881 	int p_index;
   5882 
   5883 	p_index = PP_MAPINDEX(pp);
   5884 
   5885 	sz = 0;
   5886 	p_index >>= 1;	/* don't care about 8K bit */
        	/* Count the shifts needed to clear the remaining bits. */
   5887 	for (; p_index; p_index >>= 1) {
   5888 		sz++;
   5889 	}
   5890 
   5891 	return (sz);
   5892 }
   5893 
   5894 /*
   5895  * This function unloads a range of addresses for an hmeblk.
   5896  * It returns the next address to be unloaded.
   5897  * It should be called with the hash lock held.
         *
         * Arguments:
         *	sfmmup		owning hat; NULL exactly when hmeblkp is a shared
         *			hmeblk (see the ASSERTs below)
         *	hmeblkp		non-shadow hmeblk whose range contains addr
         *	addr, endaddr	range to unload; endaddr is clipped to the end of
         *			the hmeblk
         *	dmrp		demap range accumulator, or NULL
         *	flags		HAT_UNLOAD_NOSYNC skips sfmmu_ttesync();
         *			HAT_UNLOAD_UNLOCK also drops one hblk_lckcnt for
         *			every tte that is unloaded
   5898  */
   5899 static caddr_t
   5900 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
   5901 	caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
   5902 {
   5903 	tte_t	tte, ttemod;
   5904 	struct	sf_hment *sfhmep;
   5905 	int	ttesz;
   5906 	long	ttecnt;
   5907 	page_t *pp;
   5908 	kmutex_t *pml;
   5909 	int ret;
   5910 	int use_demap_range;
   5911 
   5912 	ASSERT(in_hblk_range(hmeblkp, addr));
   5913 	ASSERT(!hmeblkp->hblk_shw_bit);
   5914 	ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
   5915 	ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
   5916 	ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);
   5917 
   5918 #ifdef DEBUG
   5919 	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
   5920 	    (endaddr < get_hblk_endaddr(hmeblkp))) {
   5921 		panic("sfmmu_hblk_unload: partial unload of large page");
   5922 	}
   5923 #endif /* DEBUG */
   5924 
   5925 	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
   5926 	ttesz = get_hblk_ttesz(hmeblkp);
   5927 
        	/*
        	 * Pages are merely marked in the demap range when there is no range
        	 * (dmrp == NULL) or when this hmeblk's page size matches the range's
        	 * page size.  Otherwise the range is flushed now and every page is
        	 * demapped individually in the loop below.
        	 */
   5928 	use_demap_range = ((dmrp == NULL) ||
   5929 	    (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));
   5930 
   5931 	if (use_demap_range) {
   5932 		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
   5933 	} else {
   5934 		DEMAP_RANGE_FLUSH(dmrp);
   5935 	}
   5936 	ttecnt = 0;
   5937 	HBLKTOHME(sfhmep, hmeblkp, addr);
   5938 
        	/*
        	 * One tte per pass.  The "continue" statements in the body retry the
        	 * same tte, since addr and sfhmep only advance at the bottom.
        	 */
   5939 	while (addr < endaddr) {
   5940 		pml = NULL;
   5941 		sfmmu_copytte(&sfhmep->hme_tte, &tte);
   5942 		if (TTE_IS_VALID(&tte)) {
   5943 			pp = sfhmep->hme_page;
   5944 			if (pp != NULL) {
   5945 				pml = sfmmu_mlist_enter(pp);
   5946 			}
   5947 
   5948 			/*
   5949 			 * Verify if hme still points to 'pp' now that
   5950 			 * we have p_mapping lock.
   5951 			 */
   5952 			if (sfhmep->hme_page != pp) {
   5953 				if (pp != NULL && sfhmep->hme_page != NULL) {
   5954 					ASSERT(pml != NULL);
   5955 					sfmmu_mlist_exit(pml);
   5956 					/* Re-start this iteration. */
   5957 					continue;
   5958 				}
        				/*
        				 * The hme lost its page; pml is still held
        				 * and is released at tte_unloaded.
        				 */
   5959 				ASSERT((pp != NULL) &&
   5960 				    (sfhmep->hme_page == NULL));
   5961 				goto tte_unloaded;
   5962 			}
   5963 
   5964 			/*
   5965 			 * This point on we have both HASH and p_mapping
   5966 			 * lock.
   5967 			 */
   5968 			ASSERT(pp == sfhmep->hme_page);
   5969 			ASSERT(pp == NULL || sfmmu_mlist_held(pp));
   5970 
   5971 			/*
   5972 			 * We need to loop on modify tte because it is
   5973 			 * possible for pagesync to come along and
   5974 			 * change the software bits beneath us.
   5975 			 *
   5976 			 * Page_unload can also invalidate the tte after
   5977 			 * we read tte outside of p_mapping lock.
   5978 			 */
   5979 again:
   5980 			ttemod = tte;
   5981 
   5982 			TTE_SET_INVALID(&ttemod);
   5983 			ret = sfmmu_modifytte_try(&tte, &ttemod,
   5984 			    &sfhmep->hme_tte);
   5985 
   5986 			if (ret <= 0) {
        				/* Still valid: only other bits changed, retry. */
   5987 				if (TTE_IS_VALID(&tte)) {
   5988 					ASSERT(ret < 0);
   5989 					goto again;
   5990 				}
   5991 				if (pp != NULL) {
   5992 					panic("sfmmu_hblk_unload: pp = 0x%p "
   5993 					    "tte became invalid under mlist"
   5994 					    " lock = 0x%p", (void *)pp,
   5995 					    (void *)pml);
   5996 				}
        				/* pp is NULL here, so no mlist lock is held. */
   5997 				continue;
   5998 			}
   5999 
   6000 			if (!(flags & HAT_UNLOAD_NOSYNC)) {
   6001 				sfmmu_ttesync(sfmmup, addr, &tte, pp);
   6002 			}
   6003 
   6004 			/*
   6005 			 * Ok- we invalidated the tte. Do the rest of the job.
   6006 			 */
   6007 			ttecnt++;
   6008 
   6009 			if (flags & HAT_UNLOAD_UNLOCK) {
   6010 				ASSERT(hmeblkp->hblk_lckcnt > 0);
   6011 				atomic_add_32(&hmeblkp->hblk_lckcnt, -1);
   6012 				HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
   6013 			}
   6014 
   6015 			/*
   6016 			 * Normally we would need to flush the page
   6017 			 * from the virtual cache at this point in
   6018 			 * order to prevent a potential cache alias
   6019 			 * inconsistency.
   6020 			 * The particular scenario we need to worry
   6021 			 * about is:
   6022 			 * Given:  va1 and va2 are two virtual address
   6023 			 * that alias and map the same physical
   6024 			 * address.
   6025 			 * 1.   mapping exists from va1 to pa and data
   6026 			 * has been read into the cache.
   6027 			 * 2.   unload va1.
   6028 			 * 3.   load va2 and modify data using va2.
   6029 			 * 4    unload va2.
   6030 			 * 5.   load va1 and reference data.  Unless we
   6031 			 * flush the data cache when we unload we will
   6032 			 * get stale data.
   6033 			 * Fortunately, page coloring eliminates the
   6034 			 * above scenario by remembering the color a
   6035 			 * physical page was last or is currently
   6036 			 * mapped to.  Now, we delay the flush until
   6037 			 * the loading of translations.  Only when the
   6038 			 * new translation is of a different color
   6039 			 * are we forced to flush.
   6040 			 */
   6041 			if (use_demap_range) {
   6042 				/*
   6043 				 * Mark this page as needing a demap.
   6044 				 */
   6045 				DEMAP_RANGE_MARKPG(dmrp, addr);
   6046 			} else {
   6047 				ASSERT(sfmmup != NULL);
   6048 				ASSERT(!hmeblkp->hblk_shared);
   6049 				sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
   6050 				    sfmmup->sfmmu_free, 0);
   6051 			}
   6052 
   6053 			if (pp) {
   6054 				/*
   6055 				 * Remove the hment from the mapping list
   6056 				 */
   6057 				ASSERT(hmeblkp->hblk_hmecnt > 0);
   6058 
   6059 				/*
   6060 				 * Again, we cannot
   6061 				 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
   6062 				 */
   6063 				HME_SUB(sfhmep, pp);
        				/* Barrier between HME_SUB and the count drop. */
   6064 				membar_stst();
   6065 				atomic_add_16(&hmeblkp->hblk_hmecnt, -1);
   6066 			}
   6067 
   6068 			ASSERT(hmeblkp->hblk_vcnt > 0);
   6069 			atomic_add_16(&hmeblkp->hblk_vcnt, -1);
   6070 
   6071 			ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
   6072 			    !hmeblkp->hblk_lckcnt);
   6073 
   6074 #ifdef VAC
   6075 			if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
   6076 				if (PP_ISTNC(pp)) {
   6077 					/*
   6078 					 * If page was temporary
   6079 					 * uncached, try to recache
   6080 					 * it. Note that HME_SUB() was
   6081 					 * called above so p_index and
   6082 					 * mlist had been updated.
   6083 					 */
   6084 					conv_tnc(pp, ttesz);
   6085 				} else if (pp->p_mapping == NULL) {
   6086 					ASSERT(kpm_enable);
   6087 					/*
   6088 					 * Page is marked to be in VAC conflict
   6089 					 * to an existing kpm mapping and/or is
   6090 					 * kpm mapped using only the regular
   6091 					 * pagesize.
   6092 					 */
   6093 					sfmmu_kpm_hme_unload(pp);
   6094 				}
   6095 			}
   6096 #endif	/* VAC */
   6097 		} else if ((pp = sfhmep->hme_page) != NULL) {
   6098 				/*
   6099 				 * TTE is invalid but the hme
   6100 				 * still exists. let pageunload
   6101 				 * complete its job.
        				 * Retry this tte for as long as the hme is
        				 * still linked once the mlist lock is taken.
   6102 				 */
   6103 				ASSERT(pml == NULL);
   6104 				pml = sfmmu_mlist_enter(pp);
   6105 				if (sfhmep->hme_page != NULL) {
   6106 					sfmmu_mlist_exit(pml);
   6107 					continue;
   6108 				}
   6109 				ASSERT(sfhmep->hme_page == NULL);
   6110 		} else if (hmeblkp->hblk_hmecnt != 0) {
   6111 			/*
   6112 			 * pageunload may have not finished decrementing
   6113 			 * hblk_vcnt and hblk_hmecnt. Find page_t if any and
   6114 			 * wait for pageunload to finish. Rely on pageunload
   6115 			 * to decrement hblk_hmecnt after hblk_vcnt.
        			 * The mlist lock is taken and dropped right away;
        			 * it is used only to wait.
   6116 			 */
   6117 			pfn_t pfn = TTE_TO_TTEPFN(&tte);
   6118 			ASSERT(pml == NULL);
   6119 			if (pf_is_memory(pfn)) {
   6120 				pp = page_numtopp_nolock(pfn);
   6121 				if (pp != NULL) {
   6122 					pml = sfmmu_mlist_enter(pp);
   6123 					sfmmu_mlist_exit(pml);
   6124 					pml = NULL;
   6125 				}
   6126 			}
   6127 		}
   6128 
   6129 tte_unloaded:
   6130 		/*
   6131 		 * At this point, the tte we are looking at
   6132 		 * should be unloaded, and hme has been unlinked
   6133 		 * from page too. This is important because in
   6134 		 * pageunload, it does ttesync() then HME_SUB.
   6135 		 * We need to make sure HME_SUB has been completed
   6136 		 * so we know ttesync() has been completed. Otherwise,
   6137 		 * at exit time, after return from hat layer, VM will
   6138 		 * release as structure which hat_setstat() (called
   6139 		 * by ttesync()) needs.
   6140 		 */
   6141 #ifdef DEBUG
   6142 		{
   6143 			tte_t	dtte;
   6144 
   6145 			ASSERT(sfhmep->hme_page == NULL);
   6146 
   6147 			sfmmu_copytte(&sfhmep->hme_tte, &dtte);
   6148 			ASSERT(!TTE_IS_VALID(&dtte));
   6149 		}
   6150 #endif
   6151 
   6152 		if (pml) {
   6153 			sfmmu_mlist_exit(pml);
   6154 		}
   6155 
   6156 		addr += TTEBYTES(ttesz);
   6157 		sfhmep++;
   6158 		DEMAP_RANGE_NEXTPG(dmrp);
   6159 	}
   6160 	/*
   6161 	 * For shared hmeblks this routine is only called when region is freed
   6162 	 * and no longer referenced.  So no need to decrement ttecnt
   6163 	 * in the region structure here.
   6164 	 */
   6165 	if (ttecnt > 0 && sfmmup != NULL) {
   6166 		atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
   6167 	}
   6168 	return (addr);
   6169 }
   6170 
   6171 /*
   6172  * Invalidate a virtual address range for the local CPU.
   6173  * For best performance ensure that the va range is completely
   6174  * mapped, otherwise the entire TLB will be flushed.
         *
         * The range [va, va + size) is walked one mapping at a time, stepping by
         * the size hat_getpagesize() reports for each address.
   6175  */
   6176 void
   6177 hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
   6178 {
   6179 	ssize_t sz;
   6180 	caddr_t endva = va + size;
   6181 
   6182 	while (va < endva) {
   6183 		sz = hat_getpagesize(sfmmup, va);
        		/* No page size for va: flush everything and stop walking. */
   6184 		if (sz < 0) {
   6185 			vtag_flushall();
   6186 			break;
   6187 		}
   6188 		vtag_flushpage(va, (uint64_t)sfmmup);
   6189 		va += sz;
   6190 	}
   6191 }
   6192 
   6193 /*
   6194  * Synchronize all the mappings in the range [addr..addr+len).
   6195  * Can be called with clearflag having two states:
   6196  * HAT_SYNC_DONTZERO means just return the rm stats
   6197  * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
         *
         * The hat must not have an xhat provider, and len must be MMU_PAGESIZE
         * aligned (both are asserted).  The per-hmeblk work is done by
         * sfmmu_hblk_sync() while the hash bucket lock is held.
   6198  */
   6199 void
   6200 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
   6201 {
   6202 	struct hmehash_bucket *hmebp;
   6203 	hmeblk_tag hblktag;
   6204 	int hmeshift, hashno = 1;
   6205 	struct hme_blk *hmeblkp, *list = NULL;
   6206 	caddr_t endaddr;
   6207 	cpuset_t cpuset;
   6208 
   6209 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   6210 	ASSERT((sfmmup == ksfmmup) ||
   6211 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   6212 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   6213 	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
   6214 	    (clearflag == HAT_SYNC_ZERORM));
   6215 
   6216 	CPUSET_ZERO(cpuset);
   6217 
   6218 	endaddr = addr + len;
   6219 	hblktag.htag_id = sfmmup;
   6220 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   6221 
   6222 	/*
   6223 	 * Spitfire supports 4 page sizes.
   6224 	 * Most pages are expected to be of the smallest page
   6225 	 * size (8K) and these will not need to be rehashed. 64K
   6226 	 * pages also don't need to be rehashed because an hmeblk
   6227 	 * spans 64K of address space. 512K pages might need 1 rehash
   6228 	 * and 4M pages 2 rehashes.
        	 *
        	 * Every probe starts at hash size 1 and moves up one size at a
        	 * time.  hashno drops back to 1 whenever addr advances.
   6229 	 */
   6230 	while (addr < endaddr) {
   6231 		hmeshift = HME_HASH_SHIFT(hashno);
   6232 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   6233 		hblktag.htag_rehash = hashno;
   6234 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   6235 
   6236 		SFMMU_HASH_LOCK(hmebp);
   6237 
   6238 		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
   6239 		if (hmeblkp != NULL) {
   6240 			ASSERT(!hmeblkp->hblk_shared);
   6241 			/*
   6242 			 * We've encountered a shadow hmeblk so skip the range
   6243 			 * of the next smaller mapping size.
   6244 			 */
   6245 			if (hmeblkp->hblk_shw_bit) {
   6246 				ASSERT(sfmmup != ksfmmup);
   6247 				ASSERT(hashno > 1);
   6248 				addr = (caddr_t)P2END((uintptr_t)addr,
   6249 				    TTEBYTES(hashno - 1));
   6250 			} else {
   6251 				addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
   6252 				    addr, endaddr, clearflag);
   6253 			}
   6254 			SFMMU_HASH_UNLOCK(hmebp);
   6255 			hashno = 1;
   6256 			continue;
   6257 		}
   6258 		SFMMU_HASH_UNLOCK(hmebp);
   6259 
   6260 		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
   6261 			/*
   6262 			 * We have traversed the whole list and rehashed
   6263 			 * if necessary without finding the address to sync.
   6264 			 * This is ok so we increment the address by the
   6265 			 * smallest hmeblk range for kernel mappings and the
   6266 			 * largest hmeblk range, to account for shadow hmeblks,
   6267 			 * for user mappings and continue.
   6268 			 */
   6269 			if (sfmmup == ksfmmup)
   6270 				addr = (caddr_t)P2END((uintptr_t)addr,
   6271 				    TTEBYTES(1));
   6272 			else
   6273 				addr = (caddr_t)P2END((uintptr_t)addr,
   6274 				    TTEBYTES(hashno));
   6275 			hashno = 1;
   6276 		} else {
   6277 			hashno++;
   6278 		}
   6279 	}
        	/*
        	 * list was handed to every HME_HASH_SEARCH() above; presumably the
        	 * search may queue hmeblks on it for disposal here -- confirm against
        	 * the macro definition.
        	 */
   6280 	sfmmu_hblks_list_purge(&list, 0);
   6281 	cpuset = sfmmup->sfmmu_cpusran;
   6282 	xt_sync(cpuset);
   6283 }
   6284 
        /*
         * Sync the valid ttes of one hmeblk in the range [addr, endaddr), where
         * endaddr is clipped to the end of the hmeblk, by calling sfmmu_ttesync()
         * on each of them.  With clearflag == HAT_SYNC_ZERORM the rm bits of each
         * tte are cleared first, and the page is demapped if the tte was modified.
         * The hmeblk must be neither a shadow nor a shared hmeblk (asserted).
         * hat_sync() calls this with the hash bucket lock held.
         * Returns the address following the last tte looked at.
         */
   6285 static caddr_t
   6286 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
   6287 	caddr_t endaddr, int clearflag)
   6288 {
   6289 	tte_t	tte, ttemod;
   6290 	struct sf_hment *sfhmep;
   6291 	int ttesz;
   6292 	struct page *pp;
   6293 	kmutex_t *pml;
   6294 	int ret;
   6295 
   6296 	ASSERT(hmeblkp->hblk_shw_bit == 0);
   6297 	ASSERT(!hmeblkp->hblk_shared);
   6298 
   6299 	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
   6300 
   6301 	ttesz = get_hblk_ttesz(hmeblkp);
   6302 	HBLKTOHME(sfhmep, hmeblkp, addr);
   6303 
        	/*
        	 * One tte per pass.  The "continue" statements retry the same tte,
        	 * since addr and sfhmep only advance at the bottom of the loop.
        	 */
   6304 	while (addr < endaddr) {
   6305 		sfmmu_copytte(&sfhmep->hme_tte, &tte);
   6306 		if (TTE_IS_VALID(&tte)) {
   6307 			pml = NULL;
   6308 			pp = sfhmep->hme_page;
   6309 			if (pp) {
   6310 				pml = sfmmu_mlist_enter(pp);
   6311 			}
   6312 			if (pp != sfhmep->hme_page) {
   6313 				/*
   6314 				 * tte must have been unloaded
   6315 				 * underneath us.  Recheck
   6316 				 */
   6317 				ASSERT(pml);
   6318 				sfmmu_mlist_exit(pml);
   6319 				continue;
   6320 			}
   6321 
   6322 			ASSERT(pp == NULL || sfmmu_mlist_held(pp));
   6323 
   6324 			if (clearflag == HAT_SYNC_ZERORM) {
   6325 				ttemod = tte;
   6326 				TTE_CLR_RM(&ttemod);
   6327 				ret = sfmmu_modifytte_try(&tte, &ttemod,
   6328 				    &sfhmep->hme_tte);
        				/* The tte changed underneath us: retry it. */
   6329 				if (ret < 0) {
   6330 					if (pml) {
   6331 						sfmmu_mlist_exit(pml);
   6332 					}
   6333 					continue;
   6334 				}
   6335 
   6336 				if (ret > 0) {
   6337 					sfmmu_tlb_demap(addr, sfmmup,
   6338 					    hmeblkp, 0, 0);
   6339 				}
   6340 			}
        			/* tte is the copy taken before any rm bits were cleared. */
   6341 			sfmmu_ttesync(sfmmup, addr, &tte, pp);
   6342 			if (pml) {
   6343 				sfmmu_mlist_exit(pml);
   6344 			}
   6345 		}
   6346 		addr += TTEBYTES(ttesz);
   6347 		sfhmep++;
   6348 	}
   6349 	return (addr);
   6350 }
   6351 
   6352 /*
   6353  * This function will sync a tte to the page struct and it will
   6354  * update the hat stats. Currently it allows us to pass a NULL pp
   6355  * and we will simply update the stats.  We may want to change this
   6356  * so we only keep stats for pages backed by pp's.
   6357  */
   6358 static void
   6359 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
   6360 {
   6361 	uint_t rm = 0;
   6362 	int   	sz;
   6363 	pgcnt_t	npgs;
   6364 
   6365 	ASSERT(TTE_IS_VALID(ttep));
   6366 
   6367 	if (TTE_IS_NOSYNC(ttep)) {
   6368 		return;
   6369 	}
   6370 
   6371 	if (TTE_IS_REF(ttep))  {
   6372 		rm = P_REF;
   6373 	}
   6374 	if (TTE_IS_MOD(ttep))  {
   6375 		rm |= P_MOD;
   6376 	}
   6377 
   6378 	if (rm == 0) {
   6379 		return;
   6380 	}
   6381 
   6382 	sz = TTE_CSZ(ttep);
   6383 	if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
   6384 		int i;
   6385 		caddr_t	vaddr = addr;
   6386 
   6387 		for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
   6388 			hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
   6389 		}
   6390 
   6391 	}
   6392 
   6393 	/*
   6394 	 * XXX I want to use cas to update nrm bits but they
   6395 	 * currently belong in common/vm and not in hat where
   6396 	 * they should be.
   6397 	 * The nrm bits are protected by the same mutex as
   6398 	 * the one that protects the page's mapping list.
   6399 	 */
   6400 	if (!pp)
   6401 		return;
   6402 	ASSERT(sfmmu_mlist_held(pp));
   6403 	/*
   6404 	 * If the tte is for a large page, we need to sync all the
   6405 	 * pages covered by the tte.
   6406 	 */
   6407 	if (sz != TTE8K) {
   6408 		ASSERT(pp->p_szc != 0);
   6409 		pp = PP_GROUPLEADER(pp, sz);
   6410 		ASSERT(sfmmu_mlist_held(pp));
   6411 	}
   6412 
   6413 	/* Get number of pages from tte size. */
   6414 	npgs = TTEPAGES(sz);
   6415 
   6416 	do {
   6417 		ASSERT(pp);
   6418 		ASSERT(sfmmu_mlist_held(pp));
   6419 		if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
   6420 		    ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
   6421 			hat_page_setattr(pp, rm);
   6422 
   6423 		/*
   6424 		 * Are we done? If not, we must have a large mapping.
   6425 		 * For large mappings we need to sync the rest of the pages
   6426 		 * covered by this tte; goto the next page.
   6427 		 */
   6428 	} while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));
   6429 }
   6430 
   6431 /*
   6432  * Execute pre-callback handler of each pa_hment linked to pp
   6433  *
   6434  * Inputs:
   6435  *   flag: either HAT_PRESUSPEND or HAT_SUSPEND.
   6436  *   capture_cpus: pointer to return value (below)
   6437  *
   6438  * Returns:
   6439  *   Propagates the subsystem callback return values back to the caller;
   6440  *   returns 0 on success.  If capture_cpus is non-NULL, the value returned
   6441  *   is zero if all of the pa_hments are of a type that do not require
   6442  *   capturing CPUs prior to suspending the mapping, else it is 1.
   6443  */
   6444 static int
   6445 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
   6446 {
   6447 	struct sf_hment	*sfhmep;
   6448 	struct pa_hment *pahmep;
   6449 	int (*f)(caddr_t, uint_t, uint_t, void *);
   6450 	int		ret;
   6451 	id_t		id;
   6452 	int		locked = 0;
   6453 	kmutex_t	*pml;
   6454 
   6455 	ASSERT(PAGE_EXCL(pp));
   6456 	if (!sfmmu_mlist_held(pp)) {
   6457 		pml = sfmmu_mlist_enter(pp);
   6458 		locked = 1;
   6459 	}
   6460 
   6461 	if (capture_cpus)
   6462 		*capture_cpus = 0;
   6463 
   6464 top:
   6465 	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
   6466 		/*
   6467 		 * skip sf_hments corresponding to VA<->PA mappings;
   6468 		 * for pa_hment's, hme_tte.ll is zero
   6469 		 */
   6470 		if (!IS_PAHME(sfhmep))
   6471 			continue;
   6472 
   6473 		pahmep = sfhmep->hme_data;
   6474 		ASSERT(pahmep != NULL);
   6475 
   6476 		/*
   6477 		 * skip if pre-handler has been called earlier in this loop
   6478 		 */
   6479 		if (pahmep->flags & flag)
   6480 			continue;
   6481 
   6482 		id = pahmep->cb_id;
   6483 		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
   6484 		if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
   6485 			*capture_cpus = 1;
   6486 		if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
   6487 			pahmep->flags |= flag;
   6488 			continue;
   6489 		}
   6490 
   6491 		/*
   6492 		 * Drop the mapping list lock to avoid locking order issues.
   6493 		 */
   6494 		if (locked)
   6495 			sfmmu_mlist_exit(pml);
   6496 
   6497 		ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
   6498 		if (ret != 0)
   6499 			return (ret);	/* caller must do the cleanup */
   6500 
   6501 		if (locked) {
   6502 			pml = sfmmu_mlist_enter(pp);
   6503 			pahmep->flags |= flag;
   6504 			goto top;
   6505 		}
   6506 
   6507 		pahmep->flags |= flag;
   6508 	}
   6509 
   6510 	if (locked)
   6511 		sfmmu_mlist_exit(pml);
   6512 
   6513 	return (0);
   6514 }
   6515 
   6516 /*
   6517  * Execute post-callback handler of each pa_hment linked to pp
   6518  *
   6519  * Same overall assumptions and restrictions apply as for
   6520  * hat_pageprocess_precallbacks().
   6521  */
   6522 static void
   6523 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
   6524 {
   6525 	pfn_t pgpfn = pp->p_pagenum;
   6526 	pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
   6527 	pfn_t newpfn;
   6528 	struct sf_hment *sfhmep;
   6529 	struct pa_hment *pahmep;
   6530 	int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);
   6531 	id_t	id;
   6532 	int	locked = 0;
   6533 	kmutex_t *pml;
   6534 
   6535 	ASSERT(PAGE_EXCL(pp));
   6536 	if (!sfmmu_mlist_held(pp)) {
   6537 		pml = sfmmu_mlist_enter(pp);
   6538 		locked = 1;
   6539 	}
   6540 
   6541 top:
   6542 	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
   6543 		/*
   6544 		 * skip sf_hments corresponding to VA<->PA mappings;
   6545 		 * for pa_hment's, hme_tte.ll is zero
   6546 		 */
   6547 		if (!IS_PAHME(sfhmep))
   6548 			continue;
   6549 
   6550 		pahmep = sfhmep->hme_data;
   6551 		ASSERT(pahmep != NULL);
   6552 
   6553 		if ((pahmep->flags & flag) == 0)
   6554 			continue;
   6555 
   6556 		pahmep->flags &= ~flag;
   6557 
   6558 		id = pahmep->cb_id;
   6559 		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
   6560 		if ((f = sfmmu_cb_table[id].posthandler) == NULL)
   6561 			continue;
   6562 
   6563 		/*
   6564 		 * Convert the base page PFN into the constituent PFN
   6565 		 * which is needed by the callback handler.
   6566 		 */
   6567 		newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);
   6568 
   6569 		/*
   6570 		 * Drop the mapping list lock to avoid locking order issues.
   6571 		 */
   6572 		if (locked)
   6573 			sfmmu_mlist_exit(pml);
   6574 
   6575 		if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
   6576 		    != 0)
   6577 			panic("sfmmu: posthandler failed");
   6578 
   6579 		if (locked) {
   6580 			pml = sfmmu_mlist_enter(pp);
   6581 			goto top;
   6582 		}
   6583 	}
   6584 
   6585 	if (locked)
   6586 		sfmmu_mlist_exit(pml);
   6587 }
   6588 
   6589 /*
   6590  * Suspend locked kernel mapping
   6591  */
   6592 void
   6593 hat_pagesuspend(struct page *pp)
   6594 {
   6595 	struct sf_hment *sfhmep;
   6596 	sfmmu_t *sfmmup;
   6597 	tte_t tte, ttemod;
   6598 	struct hme_blk *hmeblkp;
   6599 	caddr_t addr;
   6600 	int index, cons;
   6601 	cpuset_t cpuset;
   6602 
   6603 	ASSERT(PAGE_EXCL(pp));
   6604 	ASSERT(sfmmu_mlist_held(pp));
   6605 
   6606 	mutex_enter(&kpr_suspendlock);
   6607 
   6608 	/*
   6609 	 * We're about to suspend a kernel mapping so mark this thread as
   6610 	 * non-traceable by DTrace. This prevents us from running into issues
   6611 	 * with probe context trying to touch a suspended page
   6612 	 * in the relocation codepath itself.
   6613 	 */
   6614 	curthread->t_flag |= T_DONTDTRACE;
   6615 
   6616 	index = PP_MAPINDEX(pp);
   6617 	cons = TTE8K;
   6618 
   6619 retry:
   6620 	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
   6621 
   6622 		if (IS_PAHME(sfhmep))
   6623 			continue;
   6624 
   6625 		if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons)
   6626 			continue;
   6627 
   6628 		/*
   6629 		 * Loop until we successfully set the suspend bit in
   6630 		 * the TTE.
   6631 		 */
   6632 again:
   6633 		sfmmu_copytte(&sfhmep->hme_tte, &tte);
   6634 		ASSERT(TTE_IS_VALID(&tte));
   6635 
   6636 		ttemod = tte;
   6637 		TTE_SET_SUSPEND(&ttemod);
   6638 		if (sfmmu_modifytte_try(&tte, &ttemod,
   6639 		    &sfhmep->hme_tte) < 0)
   6640 			goto again;
   6641 
   6642 		/*
   6643 		 * Invalidate TSB entry
   6644 		 */
   6645 		hmeblkp = sfmmu_hmetohblk(sfhmep);
   6646 
   6647 		sfmmup = hblktosfmmu(hmeblkp);
   6648 		ASSERT(sfmmup == ksfmmup);
   6649 		ASSERT(!hmeblkp->hblk_shared);
   6650 
   6651 		addr = tte_to_vaddr(hmeblkp, tte);
   6652 
   6653 		/*
   6654 		 * No need to make sure that the TSB for this sfmmu is
   6655 		 * not being relocated since it is ksfmmup and thus it
   6656 		 * will never be relocated.
   6657 		 */
   6658 		SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
   6659 
   6660 		/*
   6661 		 * Update xcall stats
   6662 		 */
   6663 		cpuset = cpu_ready_set;
   6664 		CPUSET_DEL(cpuset, CPU->cpu_id);
   6665 
   6666 		/* LINTED: constant in conditional context */
   6667 		SFMMU_XCALL_STATS(ksfmmup);
   6668 
   6669 		/*
   6670 		 * Flush TLB entry on remote CPU's
   6671 		 */
   6672 		xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
   6673 		    (uint64_t)ksfmmup);
   6674 		xt_sync(cpuset);
   6675 
   6676 		/*
   6677 		 * Flush TLB entry on local CPU
   6678 		 */
   6679 		vtag_flushpage(addr, (uint64_t)ksfmmup);
   6680 	}
   6681 
   6682 	while (index != 0) {
   6683 		index = index >> 1;
   6684 		if (index != 0)
   6685 			cons++;
   6686 		if (index & 0x1) {
   6687 			pp = PP_GROUPLEADER(pp, cons);
   6688 			goto retry;
   6689 		}
   6690 	}
   6691 }
   6692 
   6693 #ifdef	DEBUG
   6694 
   6695 #define	N_PRLE	1024
   6696 struct prle {
   6697 	page_t *targ;
   6698 	page_t *repl;
   6699 	int status;
   6700 	int pausecpus;
   6701 	hrtime_t whence;
   6702 };
   6703 
   6704 static struct prle page_relocate_log[N_PRLE];
   6705 static int prl_entry;
   6706 static kmutex_t prl_mutex;
   6707 
   6708 #define	PAGE_RELOCATE_LOG(t, r, s, p)					\
   6709 	mutex_enter(&prl_mutex);					\
   6710 	page_relocate_log[prl_entry].targ = *(t);			\
   6711 	page_relocate_log[prl_entry].repl = *(r);			\
   6712 	page_relocate_log[prl_entry].status = (s);			\
   6713 	page_relocate_log[prl_entry].pausecpus = (p);			\
   6714 	page_relocate_log[prl_entry].whence = gethrtime();		\
   6715 	prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1;	\
   6716 	mutex_exit(&prl_mutex);
   6717 
   6718 #else	/* !DEBUG */
   6719 #define	PAGE_RELOCATE_LOG(t, r, s, p)
   6720 #endif
   6721 
/*
 * Core Kernel Page Relocation Algorithm
 *
 * Moves the contents, attributes and locked kernel mappings of the
 * (possibly large) page *target to *replacement.
 *
 * Input:
 *
 * target : 	constituent pages are SE_EXCL locked.
 * replacement:	constituent pages are SE_EXCL locked.
 *
 * Output:
 *
 * nrelocp:	number of pages relocated (written only on success)
 *
 * Returns:
 *
 * 0 on success.  EAGAIN if relocation is disabled, the cage is off, the
 * target is marked non-relocatable, or one of the presuspend/suspend
 * callbacks returned non-zero.
 *
 * The whole operation is serialized by kpr_mutex, and kreloc_thread is
 * set to the current thread for its duration.
 */
int
hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp)
{
	page_t		*targ, *repl;
	page_t		*tpp, *rpp;
	kmutex_t	*low, *high;
	spgcnt_t	npages, i;
	page_t		*pl = NULL;
	int		old_pil;
	cpuset_t	cpuset;
	int		cap_cpus;
	int		ret;
#ifdef VAC
	int		cflags = 0;
#endif

	if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) {
		PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1);
		return (EAGAIN);
	}

	mutex_enter(&kpr_mutex);
	kreloc_thread = curthread;

	targ = *target;
	repl = *replacement;
	ASSERT(repl != NULL);
	ASSERT(targ->p_szc == repl->p_szc);

	npages = page_get_pagecnt(targ->p_szc);

	/*
	 * unload VA<->PA mappings that are not locked
	 */
	tpp = targ;
	for (i = 0; i < npages; i++) {
		(void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC);
		tpp++;
	}

	/*
	 * Do "presuspend" callbacks, in a context from which we can still
	 * block as needed. Note that we don't hold the mapping list lock
	 * of "targ" at this point due to potential locking order issues;
	 * we assume that between the hat_pageunload() above and holding
	 * the SE_EXCL lock that the mapping list *cannot* change at this
	 * point.
	 */
	ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus);
	if (ret != 0) {
		/*
		 * EIO translates to fatal error, for all others cleanup
		 * and return EAGAIN.
		 */
		ASSERT(ret != EIO);
		hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND);
		PAGE_RELOCATE_LOG(target, replacement, ret, -1);
		kreloc_thread = NULL;
		mutex_exit(&kpr_mutex);
		return (EAGAIN);
	}

	/*
	 * acquire p_mapping list lock for both the target and replacement
	 * root pages.
	 *
	 * low and high refer to the need to grab the mlist locks in a
	 * specific order in order to prevent race conditions.  Thus the
	 * lower lock must be grabbed before the higher lock.
	 *
	 * This will block hat_unload's accessing p_mapping list.  Since
	 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
	 * blocked.  Thus, no one else will be accessing the p_mapping list
	 * while we suspend and reload the locked mapping below.
	 */
	tpp = targ;
	rpp = repl;
	sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high);

	kpreempt_disable();

	/*
	 * We raise our PIL to 13 so that we don't get captured by
	 * another CPU or pinned by an interrupt thread.  We can't go to
	 * PIL 14 since the nexus driver(s) may need to interrupt at
	 * that level in the case of IOMMU pseudo mappings.
	 *
	 * old_pil doubles as a marker for the exit path: -1 means the
	 * other CPUs were captured with xc_attention() and must be
	 * released with xc_dismissed(); any other value is the PIL to
	 * restore with splx().
	 */
	cpuset = cpu_ready_set;
	CPUSET_DEL(cpuset, CPU->cpu_id);
	if (!cap_cpus || CPUSET_ISNULL(cpuset)) {
		old_pil = splr(XCALL_PIL);
	} else {
		old_pil = -1;
		xc_attention(cpuset);
	}
	ASSERT(getpil() == XCALL_PIL);

	/*
	 * Now do suspend callbacks. In the case of an IOMMU mapping
	 * this will suspend all DMA activity to the page while it is
	 * being relocated. Since we are well above LOCK_LEVEL and CPUs
	 * may be captured at this point we should have acquired any needed
	 * locks in the presuspend callback.
	 */
	ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL);
	if (ret != 0) {
		/*
		 * Nothing has been moved yet, so point repl at the target
		 * to make the shared unwind path below run the
		 * postcallbacks against the target page.
		 */
		repl = targ;
		goto suspend_fail;
	}

	/*
	 * Raise the PIL yet again, this time to block all high-level
	 * interrupts on this CPU. This is necessary to prevent an
	 * interrupt routine from pinning the thread which holds the
	 * mapping suspended and then touching the suspended page.
	 *
	 * Once the page is suspended we also need to be careful to
	 * avoid calling any functions which touch any seg_kmem memory
	 * since that memory may be backed by the very page we are
	 * relocating in here!
	 *
	 * NOTE(review): no explicit PIL change is made here; presumably
	 * it is a side effect of hat_pagesuspend() acquiring
	 * kpr_suspendlock -- confirm.
	 */
	hat_pagesuspend(targ);

	/*
	 * Now that we are confident everybody has stopped using this page,
	 * copy the page contents.  Note we use a physical copy to prevent
	 * locking issues and to avoid fpRAS because we can't handle it in
	 * this context.
	 */
	for (i = 0; i < npages; i++, tpp++, rpp++) {
#ifdef VAC
		/*
		 * If the replacement has a different vcolor than
		 * the one being replaced, we need to handle VAC
		 * consistency for it just as we were setting up
		 * a new mapping to it.
		 */
		if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) &&
		    (tpp->p_vcolor != rpp->p_vcolor) &&
		    !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) {
			CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp));
			sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp),
			    rpp->p_pagenum);
		}
#endif
		/*
		 * Copy the contents of the page.
		 */
		ppcopy_kernel(tpp, rpp);
	}

	/* Rewind to the root pages for the attribute pass. */
	tpp = targ;
	rpp = repl;
	for (i = 0; i < npages; i++, tpp++, rpp++) {
		/*
		 * Copy attributes.  VAC consistency was handled above,
		 * if required.
		 */
		rpp->p_nrm = tpp->p_nrm;
		tpp->p_nrm = 0;
		rpp->p_index = tpp->p_index;
		tpp->p_index = 0;
#ifdef VAC
		rpp->p_vcolor = tpp->p_vcolor;
#endif
	}

	/*
	 * First, unsuspend the page, if we set the suspend bit, and transfer
	 * the mapping list from the target page to the replacement page.
	 * Next process postcallbacks; since pa_hment's are linked only to the
	 * p_mapping list of root page, we don't iterate over the constituent
	 * pages.
	 */
	hat_pagereload(targ, repl);

suspend_fail:
	/*
	 * On the failure path repl == targ, so the callbacks below are
	 * run against the (unmoved) target page.
	 */
	hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND);

	/*
	 * Now lower our PIL and release any captured CPUs since we
	 * are out of the "danger zone".  After this it will again be
	 * safe to acquire adaptive mutex locks, or to drop them...
	 */
	if (old_pil != -1) {
		splx(old_pil);
	} else {
		xc_dismissed(cpuset);
	}

	kpreempt_enable();

	sfmmu_mlist_reloc_exit(low, high);

	/*
	 * Postsuspend callbacks should drop any locks held across
	 * the suspend callbacks.  As before, we don't hold the mapping
	 * list lock at this point.. our assumption is that the mapping
	 * list still can't change due to our holding SE_EXCL lock and
	 * there being no unlocked mappings left. Hence the restriction
	 * on calling context to hat_delete_callback()
	 */
	hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND);
	if (ret != 0) {
		/*
		 * The HAT_SUSPEND precallbacks failed: we got here through
		 * the suspend_fail label above.
		 */
		ASSERT(ret != EIO);
		PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus);
		kreloc_thread = NULL;
		mutex_exit(&kpr_mutex);
		return (EAGAIN);
	}

	/*
	 * Now that we're out of the performance critical section we can
	 * take care of updating the hash table, since we still
	 * hold all the pages locked SE_EXCL at this point we
	 * needn't worry about things changing out from under us.
	 */
	tpp = targ;
	rpp = repl;
	for (i = 0; i < npages; i++, tpp++, rpp++) {

		/*
		 * replace targ with replacement in page_hash table
		 */
		targ = tpp;
		page_relocate_hash(rpp, targ);

		/*
		 * concatenate target; caller of platform_page_relocate()
		 * expects target to be concatenated after returning.
		 */
		ASSERT(targ->p_next == targ);
		ASSERT(targ->p_prev == targ);
		page_list_concat(&pl, &targ);
	}

	ASSERT(*target == pl);
	*nrelocp = npages;
	PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus);
	kreloc_thread = NULL;
	mutex_exit(&kpr_mutex);
	return (0);
}
   6981 
   6982 /*
   6983  * Called when stray pa_hments are found attached to a page which is
   6984  * being freed.  Notify the subsystem which attached the pa_hment of
   6985  * the error if it registered a suitable handler, else panic.
   6986  */
   6987 static void
   6988 sfmmu_pahment_leaked(struct pa_hment *pahmep)
   6989 {
   6990 	id_t cb_id = pahmep->cb_id;
   6991 
   6992 	ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
   6993 	if (sfmmu_cb_table[cb_id].errhandler != NULL) {
   6994 		if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
   6995 		    HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
   6996 			return;		/* non-fatal */
   6997 	}
   6998 	panic("pa_hment leaked: 0x%p", (void *)pahmep);
   6999 }
   7000 
   7001 /*
   7002  * Remove all mappings to page 'pp'.
   7003  */
   7004 int
   7005 hat_pageunload(struct page *pp, uint_t forceflag)
   7006 {
   7007 	struct page *origpp = pp;
   7008 	struct sf_hment *sfhme, *tmphme;
   7009 	struct hme_blk *hmeblkp;
   7010 	kmutex_t *pml;
   7011 #ifdef VAC
   7012 	kmutex_t *pmtx;
   7013 #endif
   7014 	cpuset_t cpuset, tset;
   7015 	int index, cons;
   7016 	int xhme_blks;
   7017 	int pa_hments;
   7018 
   7019 	ASSERT(PAGE_EXCL(pp));
   7020 
   7021 retry_xhat:
   7022 	tmphme = NULL;
   7023 	xhme_blks = 0;
   7024 	pa_hments = 0;
   7025 	CPUSET_ZERO(cpuset);
   7026 
   7027 	pml = sfmmu_mlist_enter(pp);
   7028 
   7029 #ifdef VAC
   7030 	if (pp->p_kpmref)
   7031 		sfmmu_kpm_pageunload(pp);
   7032 	ASSERT(!PP_ISMAPPED_KPM(pp));
   7033 #endif
   7034 	/*
   7035 	 * Clear vpm reference. Since the page is exclusively locked
   7036 	 * vpm cannot be referencing it.
   7037 	 */
   7038 	if (vpm_enable) {
   7039 		pp->p_vpmref = 0;
   7040 	}
   7041 
   7042 	index = PP_MAPINDEX(pp);
   7043 	cons = TTE8K;
   7044 retry:
   7045 	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
   7046 		tmphme = sfhme->hme_next;
   7047 
   7048 		if (IS_PAHME(sfhme)) {
   7049 			ASSERT(sfhme->hme_data != NULL);
   7050 			pa_hments++;
   7051 			continue;
   7052 		}
   7053 
   7054 		hmeblkp = sfmmu_hmetohblk(sfhme);
   7055 		if (hmeblkp->hblk_xhat_bit) {
   7056 			struct xhat_hme_blk *xblk =
   7057 			    (struct xhat_hme_blk *)hmeblkp;
   7058 
   7059 			(void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat,
   7060 			    pp, forceflag, XBLK2PROVBLK(xblk));
   7061 
   7062 			xhme_blks = 1;
   7063 			continue;
   7064 		}
   7065 
   7066 		/*
   7067 		 * If there are kernel mappings don't unload them, they will
   7068 		 * be suspended.
   7069 		 */
   7070 		if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt &&
   7071 		    hmeblkp->hblk_tag.htag_id == ksfmmup)
   7072 			continue;
   7073 
   7074 		tset = sfmmu_pageunload(pp, sfhme, cons);
   7075 		CPUSET_OR(cpuset, tset);
   7076 	}
   7077 
   7078 	while (index != 0) {
   7079 		index = index >> 1;
   7080 		if (index != 0)
   7081 			cons++;
   7082 		if (index & 0x1) {
   7083 			/* Go to leading page */
   7084 			pp = PP_GROUPLEADER(pp, cons);
   7085 			ASSERT(sfmmu_mlist_held(pp));
   7086 			goto retry;
   7087 		}
   7088 	}
   7089 
   7090 	/*
   7091 	 * cpuset may be empty if the page was only mapped by segkpm,
   7092 	 * in which case we won't actually cross-trap.
   7093 	 */
   7094 	xt_sync(cpuset);
   7095 
   7096 	/*
   7097 	 * The page should have no mappings at this point, unless
   7098 	 * we were called from hat_page_relocate() in which case we
   7099 	 * leave the locked mappings which will be suspended later.
   7100 	 */
   7101 	ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments ||
   7102 	    (forceflag == SFMMU_KERNEL_RELOC));
   7103 
   7104 #ifdef VAC
   7105 	if (PP_ISTNC(pp)) {
   7106 		if (cons == TTE8K) {
   7107 			pmtx = sfmmu_page_enter(pp);
   7108 			PP_CLRTNC(pp);
   7109 			sfmmu_page_exit(pmtx);
   7110 		} else {
   7111 			conv_tnc(pp, cons);
   7112 		}
   7113 	}
   7114 #endif	/* VAC */
   7115 
   7116 	if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) {
   7117 		/*
   7118 		 * Unlink any pa_hments and free them, calling back
   7119 		 * the responsible subsystem to notify it of the error.
   7120 		 * This can occur in situations such as drivers leaking
   7121 		 * DMA handles: naughty, but common enough that we'd like
   7122 		 * to keep the system running rather than bringing it
   7123 		 * down with an obscure error like "pa_hment leaked"
   7124 		 * which doesn't aid the user in debugging their driver.
   7125 		 */
   7126 		for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
   7127 			tmphme = sfhme->hme_next;
   7128 			if (IS_PAHME(sfhme)) {
   7129 				struct pa_hment *pahmep = sfhme->hme_data;
   7130 				sfmmu_pahment_leaked(pahmep);
   7131 				HME_SUB(sfhme, pp);
   7132 				kmem_cache_free(pa_hment_cache, pahmep);
   7133 			}
   7134 		}
   7135 
   7136 		ASSERT(!PP_ISMAPPED(origpp) || xhme_blks);
   7137 	}
   7138 
   7139 	sfmmu_mlist_exit(pml);
   7140 
   7141 	/*
   7142 	 * XHAT may not have finished unloading pages
   7143 	 * because some other thread was waiting for
   7144 	 * mlist lock and XHAT_PAGEUNLOAD let it do
   7145 	 * the job.
   7146 	 */
   7147 	if (xhme_blks) {
   7148 		pp = origpp;
   7149 		goto retry_xhat;
   7150 	}
   7151 
   7152 	return (0);
   7153 }
   7154 
   7155 cpuset_t
   7156 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons)
   7157 {
   7158 	struct hme_blk *hmeblkp;
   7159 	sfmmu_t *sfmmup;
   7160 	tte_t tte, ttemod;
   7161 #ifdef DEBUG
   7162 	tte_t orig_old;
   7163 #endif /* DEBUG */
   7164 	caddr_t addr;
   7165 	int ttesz;
   7166 	int ret;
   7167 	cpuset_t cpuset;
   7168 
   7169 	ASSERT(pp != NULL);
   7170 	ASSERT(sfmmu_mlist_held(pp));
   7171 	ASSERT(!PP_ISKAS(pp));
   7172 
   7173 	CPUSET_ZERO(cpuset);
   7174 
   7175 	hmeblkp = sfmmu_hmetohblk(sfhme);
   7176 
   7177 readtte:
   7178 	sfmmu_copytte(&sfhme->hme_tte, &tte);
   7179 	if (TTE_IS_VALID(&tte)) {
   7180 		sfmmup = hblktosfmmu(hmeblkp);
   7181 		ttesz = get_hblk_ttesz(hmeblkp);
   7182 		/*
   7183 		 * Only unload mappings of 'cons' size.
   7184 		 */
   7185 		if (ttesz != cons)
   7186 			return (cpuset);
   7187 
   7188 		/*
   7189 		 * Note that we have p_mapping lock, but no hash lock here.
   7190 		 * hblk_unload() has to have both hash lock AND p_mapping
   7191 		 * lock before it tries to modify tte. So, the tte could
   7192 		 * not become invalid in the sfmmu_modifytte_try() below.
   7193 		 */
   7194 		ttemod = tte;
   7195 #ifdef DEBUG
   7196 		orig_old = tte;
   7197 #endif /* DEBUG */
   7198 
   7199 		TTE_SET_INVALID(&ttemod);
   7200 		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
   7201 		if (ret < 0) {
   7202 #ifdef DEBUG
   7203 			/* only R/M bits can change. */
   7204 			chk_tte(&orig_old, &tte, &ttemod, hmeblkp);
   7205 #endif /* DEBUG */
   7206 			goto readtte;
   7207 		}
   7208 
   7209 		if (ret == 0) {
   7210 			panic("pageunload: cas failed?");
   7211 		}
   7212 
   7213 		addr = tte_to_vaddr(hmeblkp, tte);
   7214 
   7215 		if (hmeblkp->hblk_shared) {
   7216 			sf_srd_t *srdp = (sf_srd_t *)sfmmup;
   7217 			uint_t rid = hmeblkp->hblk_tag.htag_rid;
   7218 			sf_region_t *rgnp;
   7219 			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   7220 			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   7221 			ASSERT(srdp != NULL);
   7222 			rgnp = srdp->srd_hmergnp[rid];
   7223 			SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
   7224 			cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1);
   7225 			sfmmu_ttesync(NULL, addr, &tte, pp);
   7226 			ASSERT(rgnp->rgn_ttecnt[ttesz] > 0);
   7227 			atomic_add_long(&rgnp->rgn_ttecnt[ttesz], -1);
   7228 		} else {
   7229 			sfmmu_ttesync(sfmmup, addr, &tte, pp);
   7230 			atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1);
   7231 
   7232 			/*
   7233 			 * We need to flush the page from the virtual cache
   7234 			 * in order to prevent a virtual cache alias
   7235 			 * inconsistency. The particular scenario we need
   7236 			 * to worry about is:
   7237 			 * Given:  va1 and va2 are two virtual address that
   7238 			 * alias and will map the same physical address.
   7239 			 * 1.   mapping exists from va1 to pa and data has
   7240 			 *	been read into the cache.
   7241 			 * 2.   unload va1.
   7242 			 * 3.   load va2 and modify data using va2.
   7243 			 * 4    unload va2.
   7244 			 * 5.   load va1 and reference data.  Unless we flush
   7245 			 *	the data cache when we unload we will get
   7246 			 *	stale data.
   7247 			 * This scenario is taken care of by using virtual
   7248 			 * page coloring.
   7249 			 */
   7250 			if (sfmmup->sfmmu_ismhat) {
   7251 				/*
   7252 				 * Flush TSBs, TLBs and caches
   7253 				 * of every process
   7254 				 * sharing this ism segment.
   7255 				 */
   7256 				sfmmu_hat_lock_all();
   7257 				mutex_enter(&ism_mlist_lock);
   7258 				kpreempt_disable();
   7259 				sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp,
   7260 				    pp->p_pagenum, CACHE_NO_FLUSH);
   7261 				kpreempt_enable();
   7262 				mutex_exit(&ism_mlist_lock);
   7263 				sfmmu_hat_unlock_all();
   7264 				cpuset = cpu_ready_set;
   7265 			} else {
   7266 				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
   7267 				cpuset = sfmmup->sfmmu_cpusran;
   7268 			}
   7269 		}
   7270 
   7271 		/*
   7272 		 * Hme_sub has to run after ttesync() and a_rss update.
   7273 		 * See hblk_unload().
   7274 		 */
   7275 		HME_SUB(sfhme, pp);
   7276 		membar_stst();
   7277 
   7278 		/*
   7279 		 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
   7280 		 * since pteload may have done a HME_ADD() right after
   7281 		 * we did the HME_SUB() above. Hmecnt is now maintained
   7282 		 * by cas only. no lock guranteed its value. The only
   7283 		 * gurantee we have is the hmecnt should not be less than
   7284 		 * what it should be so the hblk will not be taken away.
   7285 		 * It's also important that we decremented the hmecnt after
   7286 		 * we are done with hmeblkp so that this hmeblk won't be
   7287 		 * stolen.
   7288 		 */
   7289 		ASSERT(hmeblkp->hblk_hmecnt > 0);
   7290 		ASSERT(hmeblkp->hblk_vcnt > 0);
   7291 		atomic_add_16(&hmeblkp->hblk_vcnt, -1);
   7292 		atomic_add_16(&hmeblkp->hblk_hmecnt, -1);
   7293 		/*
   7294 		 * This is bug 4063182.
   7295 		 * XXX: fixme
   7296 		 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
   7297 		 *	!hmeblkp->hblk_lckcnt);
   7298 		 */
   7299 	} else {
   7300 		panic("invalid tte? pp %p &tte %p",
   7301 		    (void *)pp, (void *)&tte);
   7302 	}
   7303 
   7304 	return (cpuset);
   7305 }
   7306 
/*
 * While relocating a kernel page, this function will move the mappings
 * from tpp to dpp and modify any associated data with these mappings.
 * It also unsuspends the suspended kernel mapping.
 *
 *	tpp	page the mappings are currently attached to
 *	dpp	page that takes over the mappings
 *
 * As asserted below, the caller must be running at PIL_MAX and must hold
 * the mapping list locks of both pages.  On return T_DONTDTRACE has been
 * cleared in curthread->t_flag and kpr_suspendlock has been dropped;
 * presumably both were set up by whoever suspended the mapping (that
 * code is not visible here -- confirm against the suspend path).
 */
static void
hat_pagereload(struct page *tpp, struct page *dpp)
{
	struct sf_hment *sfhme;
	tte_t tte, ttemod;
	int index, cons;

	ASSERT(getpil() == PIL_MAX);
	ASSERT(sfmmu_mlist_held(tpp));
	ASSERT(sfmmu_mlist_held(dpp));

	/*
	 * index is the page's mapping-size bitmask; cons tracks the TTE
	 * size the remaining bits correspond to, starting at 8K.
	 */
	index = PP_MAPINDEX(tpp);
	cons = TTE8K;

	/* Update real mappings to the page */
retry:
	for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) {
		/* PA hments are not translations; nothing to rewrite. */
		if (IS_PAHME(sfhme))
			continue;
		sfmmu_copytte(&sfhme->hme_tte, &tte);
		ttemod = tte;

		/*
		 * replace old pfn with new pfn in TTE
		 */
		PFN_TO_TTE(ttemod, dpp->p_pagenum);

		/*
		 * clear suspend bit
		 */
		ASSERT(TTE_IS_SUSPEND(&ttemod));
		TTE_CLR_SUSPEND(&ttemod);

		/*
		 * A negative return is treated as fatal here: there is no
		 * retry, unlike the other sfmmu_modifytte_try() callers.
		 */
		if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0)
			panic("hat_pagereload(): sfmmu_modifytte_try() failed");

		/*
		 * set hme_page point to new page
		 */
		sfhme->hme_page = dpp;
	}

	/*
	 * move p_mapping list from old page to new page
	 */
	dpp->p_mapping = tpp->p_mapping;
	tpp->p_mapping = NULL;
	dpp->p_share = tpp->p_share;
	tpp->p_share = 0;

	/*
	 * Consume the remaining bits of the mapping index.  Each shift moves
	 * to the next larger page size (cons is bumped only while bits
	 * remain); when the bit for that size is set, switch both pages to
	 * their group leaders for that size and repeat the walk above.
	 */
	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			tpp = PP_GROUPLEADER(tpp, cons);
			dpp = PP_GROUPLEADER(dpp, cons);
			goto retry;
		}
	}

	curthread->t_flag &= ~T_DONTDTRACE;
	mutex_exit(&kpr_suspendlock);
}
   7376 
/*
 * Collect the ref/mod state of every mapping of pp into the page and
 * return the page's generic attributes (PP_GENERIC_ATTR).
 *
 *	pp		page to sync
 *	clearflag	HAT_SYNC_ZERORM or HAT_SYNC_DONTZERO, optionally
 *			combined with the HAT_SYNC_STOPON_REF,
 *			HAT_SYNC_STOPON_MOD and HAT_SYNC_STOPON_SHARED
 *			early-exit modifiers
 *
 * Several early exits return the current attributes without taking the
 * mapping list lock.  Otherwise each mapping is handed to
 * sfmmu_pagesync() under the mapping list lock, first for pp itself and
 * then for the group leader of every larger page size flagged in
 * PP_MAPINDEX(pp).  The CPU sets returned by sfmmu_pagesync() are
 * accumulated and passed to xt_sync() before the lock is dropped.
 */
uint_t
hat_pagesync(struct page *pp, uint_t clearflag)
{
	struct sf_hment *sfhme, *tmphme = NULL;
	struct hme_blk *hmeblkp;
	kmutex_t *pml;
	cpuset_t cpuset, tset;
	int	index, cons;
	extern	ulong_t po_share;
	page_t	*save_pp = pp;	/* pp is replaced by group leaders below */
	int	stop_on_sh = 0;
	uint_t	shcnt;		/* only meaningful when stop_on_sh is set */

	CPUSET_ZERO(cpuset);

	/* A read-only page cannot become modified; nothing to look for. */
	if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) {
		return (PP_GENERIC_ATTR(pp));
	}

	/*
	 * When the bits are not being zeroed, the state already recorded in
	 * the page may be enough to satisfy a STOPON request.
	 */
	if ((clearflag & HAT_SYNC_ZERORM) == 0) {
		if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) {
			return (PP_GENERIC_ATTR(pp));
		}
		if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) {
			return (PP_GENERIC_ATTR(pp));
		}
		if (clearflag & HAT_SYNC_STOPON_SHARED) {
			/*
			 * Heavily shared page: mark it referenced and
			 * return without walking the mappings.
			 */
			if (pp->p_share > po_share) {
				hat_page_setattr(pp, P_REF);
				return (PP_GENERIC_ATTR(pp));
			}
			stop_on_sh = 1;
			shcnt = 0;
		}
	}

	/* STOPON_SHARED is now tracked by stop_on_sh; drop it from the flags */
	clearflag &= ~HAT_SYNC_STOPON_SHARED;
	pml = sfmmu_mlist_enter(pp);
	index = PP_MAPINDEX(pp);
	cons = TTE8K;
retry:
	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
		/*
		 * We need to save the next hment on the list since
		 * it is possible for pagesync to remove an invalid hment
		 * from the list.
		 */
		tmphme = sfhme->hme_next;
		if (IS_PAHME(sfhme))
			continue;
		/*
		 * If we are looking for large mappings and this hme doesn't
		 * reach the range we are seeking, just ignore it.
		 */
		hmeblkp = sfmmu_hmetohblk(sfhme);
		if (hmeblkp->hblk_xhat_bit)
			continue;

		if (hme_size(sfhme) < cons)
			continue;

		if (stop_on_sh) {
			/*
			 * A shared hmeblk counts once per reference to its
			 * region; a private hmeblk counts as one sharer.
			 */
			if (hmeblkp->hblk_shared) {
				sf_srd_t *srdp = hblktosrd(hmeblkp);
				uint_t rid = hmeblkp->hblk_tag.htag_rid;
				sf_region_t *rgnp;
				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
				ASSERT(srdp != NULL);
				rgnp = srdp->srd_hmergnp[rid];
				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
				    rgnp, rid);
				shcnt += rgnp->rgn_refcnt;
			} else {
				shcnt++;
			}
			if (shcnt > po_share) {
				/*
				 * tell the pager to spare the page this time
				 * around.
				 */
				hat_page_setattr(save_pp, P_REF);
				/* index = 0 also skips the large page walk */
				index = 0;
				break;
			}
		}
		/*
		 * sfmmu_pagesync() accepts only HAT_SYNC_ZERORM or
		 * HAT_SYNC_DONTZERO, so mask off the STOPON_RM modifiers.
		 */
		tset = sfmmu_pagesync(pp, sfhme,
		    clearflag & ~HAT_SYNC_STOPON_RM);
		CPUSET_OR(cpuset, tset);

		/*
		 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
		 * as the "ref" or "mod" is set or share cnt exceeds po_share.
		 */
		if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO &&
		    (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) ||
		    ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) {
			index = 0;
			break;
		}
	}

	/*
	 * Step through the remaining mapping-index bits; for every larger
	 * page size that is flagged, repeat the walk on that size's group
	 * leader page.
	 */
	while (index) {
		index = index >> 1;
		cons++;
		if (index & 0x1) {
			/* Go to leading page */
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}

	xt_sync(cpuset);
	sfmmu_mlist_exit(pml);
	return (PP_GENERIC_ATTR(save_pp));
}
   7493 
   7494 /*
   7495  * Get all the hardware dependent attributes for a page struct
   7496  */
   7497 static cpuset_t
   7498 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme,
   7499 	uint_t clearflag)
   7500 {
   7501 	caddr_t addr;
   7502 	tte_t tte, ttemod;
   7503 	struct hme_blk *hmeblkp;
   7504 	int ret;
   7505 	sfmmu_t *sfmmup;
   7506 	cpuset_t cpuset;
   7507 
   7508 	ASSERT(pp != NULL);
   7509 	ASSERT(sfmmu_mlist_held(pp));
   7510 	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
   7511 	    (clearflag == HAT_SYNC_ZERORM));
   7512 
   7513 	SFMMU_STAT(sf_pagesync);
   7514 
   7515 	CPUSET_ZERO(cpuset);
   7516 
   7517 sfmmu_pagesync_retry:
   7518 
   7519 	sfmmu_copytte(&sfhme->hme_tte, &tte);
   7520 	if (TTE_IS_VALID(&tte)) {
   7521 		hmeblkp = sfmmu_hmetohblk(sfhme);
   7522 		sfmmup = hblktosfmmu(hmeblkp);
   7523 		addr = tte_to_vaddr(hmeblkp, tte);
   7524 		if (clearflag == HAT_SYNC_ZERORM) {
   7525 			ttemod = tte;
   7526 			TTE_CLR_RM(&ttemod);
   7527 			ret = sfmmu_modifytte_try(&tte, &ttemod,
   7528 			    &sfhme->hme_tte);
   7529 			if (ret < 0) {
   7530 				/*
   7531 				 * cas failed and the new value is not what
   7532 				 * we want.
   7533 				 */
   7534 				goto sfmmu_pagesync_retry;
   7535 			}
   7536 
   7537 			if (ret > 0) {
   7538 				/* we win the cas */
   7539 				if (hmeblkp->hblk_shared) {
   7540 					sf_srd_t *srdp = (sf_srd_t *)sfmmup;
   7541 					uint_t rid =
   7542 					    hmeblkp->hblk_tag.htag_rid;
   7543 					sf_region_t *rgnp;
   7544 					ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   7545 					ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   7546 					ASSERT(srdp != NULL);
   7547 					rgnp = srdp->srd_hmergnp[rid];
   7548 					SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
   7549 					    srdp, rgnp, rid);
   7550 					cpuset = sfmmu_rgntlb_demap(addr,
   7551 					    rgnp, hmeblkp, 1);
   7552 				} else {
   7553 					sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
   7554 					    0, 0);
   7555 					cpuset = sfmmup->sfmmu_cpusran;
   7556 				}
   7557 			}
   7558 		}
   7559 		sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr,
   7560 		    &tte, pp);
   7561 	}
   7562 	return (cpuset);
   7563 }
   7564 
   7565 /*
   7566  * Remove write permission from a mappings to a page, so that
   7567  * we can detect the next modification of it. This requires modifying
   7568  * the TTE then invalidating (demap) any TLB entry using that TTE.
   7569  * This code is similar to sfmmu_pagesync().
   7570  */
   7571 static cpuset_t
   7572 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme)
   7573 {
   7574 	caddr_t addr;
   7575 	tte_t tte;
   7576 	tte_t ttemod;
   7577 	struct hme_blk *hmeblkp;
   7578 	int ret;
   7579 	sfmmu_t *sfmmup;
   7580 	cpuset_t cpuset;
   7581 
   7582 	ASSERT(pp != NULL);
   7583 	ASSERT(sfmmu_mlist_held(pp));
   7584 
   7585 	CPUSET_ZERO(cpuset);
   7586 	SFMMU_STAT(sf_clrwrt);
   7587 
   7588 retry:
   7589 
   7590 	sfmmu_copytte(&sfhme->hme_tte, &tte);
   7591 	if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) {
   7592 		hmeblkp = sfmmu_hmetohblk(sfhme);
   7593 
   7594 		/*
   7595 		 * xhat mappings should never be to a VMODSORT page.
   7596 		 */
   7597 		ASSERT(hmeblkp->hblk_xhat_bit == 0);
   7598 
   7599 		sfmmup = hblktosfmmu(hmeblkp);
   7600 		addr = tte_to_vaddr(hmeblkp, tte);
   7601 
   7602 		ttemod = tte;
   7603 		TTE_CLR_WRT(&ttemod);
   7604 		TTE_CLR_MOD(&ttemod);
   7605 		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
   7606 
   7607 		/*
   7608 		 * if cas failed and the new value is not what
   7609 		 * we want retry
   7610 		 */
   7611 		if (ret < 0)
   7612 			goto retry;
   7613 
   7614 		/* we win the cas */
   7615 		if (ret > 0) {
   7616 			if (hmeblkp->hblk_shared) {
   7617 				sf_srd_t *srdp = (sf_srd_t *)sfmmup;
   7618 				uint_t rid = hmeblkp->hblk_tag.htag_rid;
   7619 				sf_region_t *rgnp;
   7620 				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   7621 				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   7622 				ASSERT(srdp != NULL);
   7623 				rgnp = srdp->srd_hmergnp[rid];
   7624 				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
   7625 				    srdp, rgnp, rid);
   7626 				cpuset = sfmmu_rgntlb_demap(addr,
   7627 				    rgnp, hmeblkp, 1);
   7628 			} else {
   7629 				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
   7630 				cpuset = sfmmup->sfmmu_cpusran;
   7631 			}
   7632 		}
   7633 	}
   7634 
   7635 	return (cpuset);
   7636 }
   7637 
   7638 /*
   7639  * Walk all mappings of a page, removing write permission and clearing the
   7640  * ref/mod bits. This code is similar to hat_pagesync()
   7641  */
   7642 static void
   7643 hat_page_clrwrt(page_t *pp)
   7644 {
   7645 	struct sf_hment *sfhme;
   7646 	struct sf_hment *tmphme = NULL;
   7647 	kmutex_t *pml;
   7648 	cpuset_t cpuset;
   7649 	cpuset_t tset;
   7650 	int	index;
   7651 	int	 cons;
   7652 
   7653 	CPUSET_ZERO(cpuset);
   7654 
   7655 	pml = sfmmu_mlist_enter(pp);
   7656 	index = PP_MAPINDEX(pp);
   7657 	cons = TTE8K;
   7658 retry:
   7659 	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
   7660 		tmphme = sfhme->hme_next;
   7661 
   7662 		/*
   7663 		 * If we are looking for large mappings and this hme doesn't
   7664 		 * reach the range we are seeking, just ignore its.
   7665 		 */
   7666 
   7667 		if (hme_size(sfhme) < cons)
   7668 			continue;
   7669 
   7670 		tset = sfmmu_pageclrwrt(pp, sfhme);
   7671 		CPUSET_OR(cpuset, tset);
   7672 	}
   7673 
   7674 	while (index) {
   7675 		index = index >> 1;
   7676 		cons++;
   7677 		if (index & 0x1) {
   7678 			/* Go to leading page */
   7679 			pp = PP_GROUPLEADER(pp, cons);
   7680 			goto retry;
   7681 		}
   7682 	}
   7683 
   7684 	xt_sync(cpuset);
   7685 	sfmmu_mlist_exit(pml);
   7686 }
   7687 
   7688 /*
   7689  * Set the given REF/MOD/RO bits for the given page.
   7690  * For a vnode with a sorted v_pages list, we need to change
   7691  * the attributes and the v_pages list together under page_vnode_mutex.
   7692  */
   7693 void
   7694 hat_page_setattr(page_t *pp, uint_t flag)
   7695 {
   7696 	vnode_t		*vp = pp->p_vnode;
   7697 	page_t		**listp;
   7698 	kmutex_t	*pmtx;
   7699 	kmutex_t	*vphm = NULL;
   7700 	int		noshuffle;
   7701 
   7702 	noshuffle = flag & P_NSH;
   7703 	flag &= ~P_NSH;
   7704 
   7705 	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
   7706 
   7707 	/*
   7708 	 * nothing to do if attribute already set
   7709 	 */
   7710 	if ((pp->p_nrm & flag) == flag)
   7711 		return;
   7712 
   7713 	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
   7714 	    !noshuffle) {
   7715 		vphm = page_vnode_mutex(vp);
   7716 		mutex_enter(vphm);
   7717 	}
   7718 
   7719 	pmtx = sfmmu_page_enter(pp);
   7720 	pp->p_nrm |= flag;
   7721 	sfmmu_page_exit(pmtx);
   7722 
   7723 	if (vphm != NULL) {
   7724 		/*
   7725 		 * Some File Systems examine v_pages for NULL w/o
   7726 		 * grabbing the vphm mutex. Must not let it become NULL when
   7727 		 * pp is the only page on the list.
   7728 		 */
   7729 		if (pp->p_vpnext != pp) {
   7730 			page_vpsub(&vp->v_pages, pp);
   7731 			if (vp->v_pages != NULL)
   7732 				listp = &vp->v_pages->p_vpprev->p_vpnext;
   7733 			else
   7734 				listp = &vp->v_pages;
   7735 			page_vpadd(listp, pp);
   7736 		}
   7737 		mutex_exit(vphm);
   7738 	}
   7739 }
   7740 
   7741 void
   7742 hat_page_clrattr(page_t *pp, uint_t flag)
   7743 {
   7744 	vnode_t		*vp = pp->p_vnode;
   7745 	kmutex_t	*pmtx;
   7746 
   7747 	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
   7748 
   7749 	pmtx = sfmmu_page_enter(pp);
   7750 
   7751 	/*
   7752 	 * Caller is expected to hold page's io lock for VMODSORT to work
   7753 	 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
   7754 	 * bit is cleared.
   7755 	 * We don't have assert to avoid tripping some existing third party
   7756 	 * code. The dirty page is moved back to top of the v_page list
   7757 	 * after IO is done in pvn_write_done().
   7758 	 */
   7759 	pp->p_nrm &= ~flag;
   7760 	sfmmu_page_exit(pmtx);
   7761 
   7762 	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
   7763 
   7764 		/*
   7765 		 * VMODSORT works by removing write permissions and getting
   7766 		 * a fault when a page is made dirty. At this point
   7767 		 * we need to remove write permission from all mappings
   7768 		 * to this page.
   7769 		 */
   7770 		hat_page_clrwrt(pp);
   7771 	}
   7772 }
   7773 
   7774 uint_t
   7775 hat_page_getattr(page_t *pp, uint_t flag)
   7776 {
   7777 	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
   7778 	return ((uint_t)(pp->p_nrm & flag));
   7779 }
   7780 
   7781 /*
   7782  * DEBUG kernels: verify that a kernel va<->pa translation
   7783  * is safe by checking the underlying page_t is in a page
   7784  * relocation-safe state.
   7785  */
   7786 #ifdef	DEBUG
   7787 void
   7788 sfmmu_check_kpfn(pfn_t pfn)
   7789 {
   7790 	page_t *pp;
   7791 	int index, cons;
   7792 
   7793 	if (hat_check_vtop == 0)
   7794 		return;
   7795 
   7796 	if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr)
   7797 		return;
   7798 
   7799 	pp = page_numtopp_nolock(pfn);
   7800 	if (!pp)
   7801 		return;
   7802 
   7803 	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
   7804 		return;
   7805 
   7806 	/*
   7807 	 * Handed a large kernel page, we dig up the root page since we
   7808 	 * know the root page might have the lock also.
   7809 	 */
   7810 	if (pp->p_szc != 0) {
   7811 		index = PP_MAPINDEX(pp);
   7812 		cons = TTE8K;
   7813 again:
   7814 		while (index != 0) {
   7815 			index >>= 1;
   7816 			if (index != 0)
   7817 				cons++;
   7818 			if (index & 0x1) {
   7819 				pp = PP_GROUPLEADER(pp, cons);
   7820 				goto again;
   7821 			}
   7822 		}
   7823 	}
   7824 
   7825 	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
   7826 		return;
   7827 
   7828 	/*
   7829 	 * Pages need to be locked or allocated "permanent" (either from
   7830 	 * static_arena arena or explicitly setting PG_NORELOC when calling
   7831 	 * page_create_va()) for VA->PA translations to be valid.
   7832 	 */
   7833 	if (!PP_ISNORELOC(pp))
   7834 		panic("Illegal VA->PA translation, pp 0x%p not permanent",
   7835 		    (void *)pp);
   7836 	else
   7837 		panic("Illegal VA->PA translation, pp 0x%p not locked",
   7838 		    (void *)pp);
   7839 }
   7840 #endif	/* DEBUG */
   7841 
   7842 /*
   7843  * Returns a page frame number for a given virtual address.
   7844  * Returns PFN_INVALID to indicate an invalid mapping
   7845  */
   7846 pfn_t
   7847 hat_getpfnum(struct hat *hat, caddr_t addr)
   7848 {
   7849 	pfn_t pfn;
   7850 	tte_t tte;
   7851 
   7852 	/*
   7853 	 * We would like to
   7854 	 * ASSERT(AS_LOCK_HELD(as, &as->a_lock));
   7855 	 * but we can't because the iommu driver will call this
   7856 	 * routine at interrupt time and it can't grab the as lock
   7857 	 * or it will deadlock: A thread could have the as lock
   7858 	 * and be waiting for io.  The io can't complete
   7859 	 * because the interrupt thread is blocked trying to grab
   7860 	 * the as lock.
   7861 	 */
   7862 
   7863 	ASSERT(hat->sfmmu_xhat_provider == NULL);
   7864 
   7865 	if (hat == ksfmmup) {
   7866 		if (IS_KMEM_VA_LARGEPAGE(addr)) {
   7867 			ASSERT(segkmem_lpszc > 0);
   7868 			pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
   7869 			if (pfn != PFN_INVALID) {
   7870 				sfmmu_check_kpfn(pfn);
   7871 				return (pfn);
   7872 			}
   7873 		} else if (segkpm && IS_KPM_ADDR(addr)) {
   7874 			return (sfmmu_kpm_vatopfn(addr));
   7875 		}
   7876 		while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
   7877 		    == PFN_SUSPENDED) {
   7878 			sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
   7879 		}
   7880 		sfmmu_check_kpfn(pfn);
   7881 		return (pfn);
   7882 	} else {
   7883 		return (sfmmu_uvatopfn(addr, hat, NULL));
   7884 	}
   7885 }
   7886 
   7887 /*
   7888  * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged.
   7889  * Use hat_getpfnum(kas.a_hat, ...) instead.
   7890  *
   7891  * We'd like to return PFN_INVALID if the mappings have underlying page_t's
   7892  * but can't right now due to the fact that some software has grown to use
   7893  * this interface incorrectly. So for now when the interface is misused,
   7894  * return a warning to the user that in the future it won't work in the
   7895  * way they're abusing it, and carry on (after disabling page relocation).
   7896  */
   7897 pfn_t
   7898 hat_getkpfnum(caddr_t addr)
   7899 {
   7900 	pfn_t pfn;
   7901 	tte_t tte;
   7902 	int badcaller = 0;
   7903 	extern int segkmem_reloc;
   7904 
   7905 	if (segkpm && IS_KPM_ADDR(addr)) {
   7906 		badcaller = 1;
   7907 		pfn = sfmmu_kpm_vatopfn(addr);
   7908 	} else {
   7909 		while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
   7910 		    == PFN_SUSPENDED) {
   7911 			sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
   7912 		}
   7913 		badcaller = pf_is_memory(pfn);
   7914 	}
   7915 
   7916 	if (badcaller) {
   7917 		/*
   7918 		 * We can't return PFN_INVALID or the caller may panic
   7919 		 * or corrupt the system.  The only alternative is to
   7920 		 * disable page relocation at this point for all kernel
   7921 		 * memory.  This will impact any callers of page_relocate()
   7922 		 * such as FMA or DR.
   7923 		 *
   7924 		 * RFE: Add junk here to spit out an ereport so the sysadmin
   7925 		 * can be advised that he should upgrade his device driver
   7926 		 * so that this doesn't happen.
   7927 		 */
   7928 		hat_getkpfnum_badcall(caller());
   7929 		if (hat_kpr_enabled && segkmem_reloc) {
   7930 			hat_kpr_enabled = 0;
   7931 			segkmem_reloc = 0;
   7932 			cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED");
   7933 		}
   7934 	}
   7935 	return (pfn);
   7936 }
   7937 
   7938 /*
   7939  * This routine will return both pfn and tte for the vaddr.
   7940  */
   7941 static pfn_t
   7942 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep)
   7943 {
   7944 	struct hmehash_bucket *hmebp;
   7945 	hmeblk_tag hblktag;
   7946 	int hmeshift, hashno = 1;
   7947 	struct hme_blk *hmeblkp = NULL;
   7948 	tte_t tte;
   7949 
   7950 	struct sf_hment *sfhmep;
   7951 	pfn_t pfn;
   7952 
   7953 	/* support for ISM */
   7954 	ism_map_t	*ism_map;
   7955 	ism_blk_t	*ism_blkp;
   7956 	int		i;
   7957 	sfmmu_t *ism_hatid = NULL;
   7958 	sfmmu_t *locked_hatid = NULL;
   7959 	sfmmu_t	*sv_sfmmup = sfmmup;
   7960 	caddr_t	sv_vaddr = vaddr;
   7961 	sf_srd_t *srdp;
   7962 
   7963 	if (ttep == NULL) {
   7964 		ttep = &tte;
   7965 	} else {
   7966 		ttep->ll = 0;
   7967 	}
   7968 
   7969 	ASSERT(sfmmup != ksfmmup);
   7970 	SFMMU_STAT(sf_user_vtop);
   7971 	/*
   7972 	 * Set ism_hatid if vaddr falls in a ISM segment.
   7973 	 */
   7974 	ism_blkp = sfmmup->sfmmu_iblk;
   7975 	if (ism_blkp != NULL) {
   7976 		sfmmu_ismhat_enter(sfmmup, 0);
   7977 		locked_hatid = sfmmup;
   7978 	}
   7979 	while (ism_blkp != NULL && ism_hatid == NULL) {
   7980 		ism_map = ism_blkp->iblk_maps;
   7981 		for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
   7982 			if (vaddr >= ism_start(ism_map[i]) &&
   7983 			    vaddr < ism_end(ism_map[i])) {
   7984 				sfmmup = ism_hatid = ism_map[i].imap_ismhat;
   7985 				vaddr = (caddr_t)(vaddr -
   7986 				    ism_start(ism_map[i]));
   7987 				break;
   7988 			}
   7989 		}
   7990 		ism_blkp = ism_blkp->iblk_next;
   7991 	}
   7992 	if (locked_hatid) {
   7993 		sfmmu_ismhat_exit(locked_hatid, 0);
   7994 	}
   7995 
   7996 	hblktag.htag_id = sfmmup;
   7997 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   7998 	do {
   7999 		hmeshift = HME_HASH_SHIFT(hashno);
   8000 		hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
   8001 		hblktag.htag_rehash = hashno;
   8002 		hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
   8003 
   8004 		SFMMU_HASH_LOCK(hmebp);
   8005 
   8006 		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   8007 		if (hmeblkp != NULL) {
   8008 			ASSERT(!hmeblkp->hblk_shared);
   8009 			HBLKTOHME(sfhmep, hmeblkp, vaddr);
   8010 			sfmmu_copytte(&sfhmep->hme_tte, ttep);
   8011 			SFMMU_HASH_UNLOCK(hmebp);
   8012 			if (TTE_IS_VALID(ttep)) {
   8013 				pfn = TTE_TO_PFN(vaddr, ttep);
   8014 				return (pfn);
   8015 			}
   8016 			break;
   8017 		}
   8018 		SFMMU_HASH_UNLOCK(hmebp);
   8019 		hashno++;
   8020 	} while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt));
   8021 
   8022 	if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) {
   8023 		return (PFN_INVALID);
   8024 	}
   8025 	srdp = sv_sfmmup->sfmmu_srdp;
   8026 	ASSERT(srdp != NULL);
   8027 	ASSERT(srdp->srd_refcnt != 0);
   8028 	hblktag.htag_id = srdp;
   8029 	hashno = 1;
   8030 	do {
   8031 		hmeshift = HME_HASH_SHIFT(hashno);
   8032 		hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift);
   8033 		hblktag.htag_rehash = hashno;
   8034 		hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift);
   8035 
   8036 		SFMMU_HASH_LOCK(hmebp);
   8037 		for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL;
   8038 		    hmeblkp = hmeblkp->hblk_next) {
   8039 			uint_t rid;
   8040 			sf_region_t *rgnp;
   8041 			caddr_t rsaddr;
   8042 			caddr_t readdr;
   8043 
   8044 			if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag,
   8045 			    sv_sfmmup->sfmmu_hmeregion_map)) {
   8046 				continue;
   8047 			}
   8048 			ASSERT(hmeblkp->hblk_shared);
   8049 			rid = hmeblkp->hblk_tag.htag_rid;
   8050 			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   8051 			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   8052 			rgnp = srdp->srd_hmergnp[rid];
   8053 			SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
   8054 			HBLKTOHME(sfhmep, hmeblkp, sv_vaddr);
   8055 			sfmmu_copytte(&sfhmep->hme_tte, ttep);
   8056 			rsaddr = rgnp->rgn_saddr;
   8057 			readdr = rsaddr + rgnp->rgn_size;
   8058 #ifdef DEBUG
   8059 			if (TTE_IS_VALID(ttep) ||
   8060 			    get_hblk_ttesz(hmeblkp) > TTE8K) {
   8061 				caddr_t eva = tte_to_evaddr(hmeblkp, ttep);
   8062 				ASSERT(eva > sv_vaddr);
   8063 				ASSERT(sv_vaddr >= rsaddr);
   8064 				ASSERT(sv_vaddr < readdr);
   8065 				ASSERT(eva <= readdr);
   8066 			}
   8067 #endif /* DEBUG */
   8068 			/*
   8069 			 * Continue the search if we
   8070 			 * found an invalid 8K tte outside of the area
   8071 			 * covered by this hmeblk's region.
   8072 			 */
   8073 			if (TTE_IS_VALID(ttep)) {
   8074 				SFMMU_HASH_UNLOCK(hmebp);
   8075 				pfn = TTE_TO_PFN(sv_vaddr, ttep);
   8076 				return (pfn);
   8077 			} else if (get_hblk_ttesz(hmeblkp) > TTE8K ||
   8078 			    (sv_vaddr >= rsaddr && sv_vaddr < readdr)) {
   8079 				SFMMU_HASH_UNLOCK(hmebp);
   8080 				pfn = PFN_INVALID;
   8081 				return (pfn);
   8082 			}
   8083 		}
   8084 		SFMMU_HASH_UNLOCK(hmebp);
   8085 		hashno++;
   8086 	} while (hashno <= mmu_hashcnt);
   8087 	return (PFN_INVALID);
   8088 }
   8089 
   8090 
/*
 * For compatibility with AT&T and later optimizations.
 *
 * This implementation does no work: apart from the assertions on hat,
 * the arguments are unused.
 */
/* ARGSUSED */
void
hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
{
	ASSERT(hat != NULL);
	ASSERT(hat->sfmmu_xhat_provider == NULL);
}
   8101 
   8102 /*
   8103  * Return the number of mappings to a particular page.  This number is an
   8104  * approximation of the number of people sharing the page.
   8105  *
   8106  * shared hmeblks or ism hmeblks are counted as 1 mapping here.
   8107  * hat_page_checkshare() can be used to compare threshold to share
   8108  * count that reflects the number of region sharers albeit at higher cost.
   8109  */
   8110 ulong_t
   8111 hat_page_getshare(page_t *pp)
   8112 {
   8113 	page_t *spp = pp;	/* start page */
   8114 	kmutex_t *pml;
   8115 	ulong_t	cnt;
   8116 	int index, sz = TTE64K;
   8117 
   8118 	/*
   8119 	 * We need to grab the mlist lock to make sure any outstanding
   8120 	 * load/unloads complete.  Otherwise we could return zero
   8121 	 * even though the unload(s) hasn't finished yet.
   8122 	 */
   8123 	pml = sfmmu_mlist_enter(spp);
   8124 	cnt = spp->p_share;
   8125 
   8126 #ifdef VAC
   8127 	if (kpm_enable)
   8128 		cnt += spp->p_kpmref;
   8129 #endif
   8130 	if (vpm_enable && pp->p_vpmref) {
   8131