Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * VM - Hardware Address Translation management for Spitfire MMU.
     28  *
     29  * This file implements the machine specific hardware translation
     30  * needed by the VM system.  The machine independent interface is
     31  * described in <vm/hat.h> while the machine dependent interface
     32  * and data structures are described in <vm/hat_sfmmu.h>.
     33  *
     34  * The hat layer manages the address translation hardware as a cache
     35  * driven by calls from the higher levels in the VM system.
     36  */
     37 
     38 #include <sys/types.h>
     39 #include <sys/kstat.h>
     40 #include <vm/hat.h>
     41 #include <vm/hat_sfmmu.h>
     42 #include <vm/page.h>
     43 #include <sys/pte.h>
     44 #include <sys/systm.h>
     45 #include <sys/mman.h>
     46 #include <sys/sysmacros.h>
     47 #include <sys/machparam.h>
     48 #include <sys/vtrace.h>
     49 #include <sys/kmem.h>
     50 #include <sys/mmu.h>
     51 #include <sys/cmn_err.h>
     52 #include <sys/cpu.h>
     53 #include <sys/cpuvar.h>
     54 #include <sys/debug.h>
     55 #include <sys/lgrp.h>
     56 #include <sys/archsystm.h>
     57 #include <sys/machsystm.h>
     58 #include <sys/vmsystm.h>
     59 #include <vm/as.h>
     60 #include <vm/seg.h>
     61 #include <vm/seg_kp.h>
     62 #include <vm/seg_kmem.h>
     63 #include <vm/seg_kpm.h>
     64 #include <vm/rm.h>
     65 #include <sys/t_lock.h>
     66 #include <sys/obpdefs.h>
     67 #include <sys/vm_machparam.h>
     68 #include <sys/var.h>
     69 #include <sys/trap.h>
     70 #include <sys/machtrap.h>
     71 #include <sys/scb.h>
     72 #include <sys/bitmap.h>
     73 #include <sys/machlock.h>
     74 #include <sys/membar.h>
     75 #include <sys/atomic.h>
     76 #include <sys/cpu_module.h>
     77 #include <sys/prom_debug.h>
     78 #include <sys/ksynch.h>
     79 #include <sys/mem_config.h>
     80 #include <sys/mem_cage.h>
     81 #include <vm/vm_dep.h>
     82 #include <vm/xhat_sfmmu.h>
     83 #include <sys/fpu/fpusystm.h>
     84 #include <vm/mach_kpm.h>
     85 #include <sys/callb.h>
     86 
     87 #ifdef	DEBUG
        /*
         * SFMMU_VALIDATE_HMERID: DEBUG-only sanity check of a shared HME region id.
         * When rid passes SFMMU_IS_SHMERID_VALID(), assert that it is below
         * SFMMU_MAX_HME_REGIONS and set in the hat's sfmmu_hmeregion_map, that the
         * hat is not ksfmmup, that the hat's SRD and the region it holds for rid
         * are referenced, not free and of type SFMMU_REGION_HME, and that
         * [saddr, saddr + len) lies inside the region.
         *
         * The expansion is a bare if statement, so invoke it only as a complete
         * statement of its own.  Non-DEBUG kernels expand it to nothing.
         * Every macro argument is parenthesized at its point of use so that
         * expression arguments are evaluated as a unit.
         */
     88 #define	SFMMU_VALIDATE_HMERID(hat, rid, saddr, len)			\
     89 	if (SFMMU_IS_SHMERID_VALID(rid)) {				\
     90 		caddr_t _eaddr = (saddr) + (len);			\
     91 		sf_srd_t *_srdp;					\
     92 		sf_region_t *_rgnp;					\
     93 		ASSERT((rid) < SFMMU_MAX_HME_REGIONS);			\
     94 		ASSERT(SF_RGNMAP_TEST((hat)->sfmmu_hmeregion_map, rid)); \
     95 		ASSERT((hat) != ksfmmup);				\
     96 		_srdp = (hat)->sfmmu_srdp;				\
     97 		ASSERT(_srdp != NULL);					\
     98 		ASSERT(_srdp->srd_refcnt != 0);				\
     99 		_rgnp = _srdp->srd_hmergnp[(rid)];			\
    100 		ASSERT(_rgnp != NULL && _rgnp->rgn_id == (rid));	\
    101 		ASSERT(_rgnp->rgn_refcnt != 0);				\
    102 		ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE));	\
    103 		ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) ==	\
    104 		    SFMMU_REGION_HME);					\
    105 		ASSERT((saddr) >= _rgnp->rgn_saddr);			\
    106 		ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size);	\
    107 		ASSERT(_eaddr > _rgnp->rgn_saddr);			\
    108 		ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size);	\
    109 	}
    110 
        /*
         * SFMMU_VALIDATE_SHAREDHBLK: DEBUG-only sanity check of a shared hmeblk
         * against the region (rgnp, rid) of srdp.  Asserts that the SRD is
         * referenced, that the region is not free, is of type SFMMU_REGION_HME
         * and has id rid, that the hmeblk's tte size does not exceed rgn_pgszc,
         * that the hmeblk's [base, end) range lies within the region's range
         * rounded out to HBLK_MIN_BYTES, and that rgn_hmeflags has the bit for
         * the hmeblk's tte size (raised to HBLK_MIN_TTESZ if smaller) set.
         *
         * The expansion is a bare compound statement.  Non-DEBUG kernels expand
         * it to nothing.
         */
    111 #define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)		 \
    112 {									 \
    113 		caddr_t _hsva;						 \
    114 		caddr_t _heva;						 \
    115 		caddr_t _rsva;						 \
    116 		caddr_t _reva;						 \
    117 		int	_ttesz = get_hblk_ttesz(hmeblkp);		 \
    118 		int	_flagtte;					 \
    119 		ASSERT((srdp)->srd_refcnt != 0);			 \
    120 		ASSERT((rid) < SFMMU_MAX_HME_REGIONS);			 \
    121 		ASSERT((rgnp)->rgn_id == (rid));			 \
    122 		ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE));	 \
    123 		ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) ==	 \
    124 		    SFMMU_REGION_HME);					 \
    125 		ASSERT(_ttesz <= (rgnp)->rgn_pgszc);			 \
    126 		_hsva = (caddr_t)get_hblk_base(hmeblkp);		 \
    127 		_heva = get_hblk_endaddr(hmeblkp);			 \
    128 		_rsva = (caddr_t)P2ALIGN(				 \
    129 		    (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES);	 \
    130 		_reva = (caddr_t)P2ROUNDUP(				 \
    131 		    (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size),	 \
    132 		    HBLK_MIN_BYTES);					 \
    133 		ASSERT(_hsva >= _rsva);					 \
    134 		ASSERT(_hsva < _reva);					 \
    135 		ASSERT(_heva > _rsva);					 \
    136 		ASSERT(_heva <= _reva);					 \
    137 		_flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ :  \
    138 			_ttesz;						 \
    139 		ASSERT((rgnp)->rgn_hmeflags & (0x1 << _flagtte));	 \
    140 }
    141 
    142 #else /* DEBUG */
    143 #define	SFMMU_VALIDATE_HMERID(hat, rid, saddr, len)
    144 #define	SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
    145 #endif /* DEBUG */
    146 
    147 #if defined(SF_ERRATA_57)
    148 extern caddr_t errata57_limit;
    149 #endif
    150 
    151 #define	HME8BLK_SZ_RND		((roundup(HME8BLK_SZ, sizeof (int64_t))) /  \
    152 				(sizeof (int64_t)))
    153 #define	HBLK_RESERVE		((struct hme_blk *)hblk_reserve)
    154 
    155 #define	HBLK_RESERVE_CNT	128
    156 #define	HBLK_RESERVE_MIN	20
    157 
    158 static struct hme_blk		*freehblkp;
    159 static kmutex_t			freehblkp_lock;
    160 static int			freehblkcnt;
    161 
    162 static int64_t			hblk_reserve[HME8BLK_SZ_RND];
    163 static kmutex_t			hblk_reserve_lock;
    164 static kthread_t		*hblk_reserve_thread;
    165 
    166 static nucleus_hblk8_info_t	nucleus_hblk8;
    167 static nucleus_hblk1_info_t	nucleus_hblk1;
    168 
    169 /*
    170  * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here
    171  * after the initial phase of removing an hmeblk from the hash chain, see
    172  * the detailed comment in sfmmu_hblk_hash_rm() for further details.
    173  */
    174 static cpu_hme_pend_t		*cpu_hme_pend;
    175 static uint_t			cpu_hme_pend_thresh;
    176 /*
    177  * SFMMU specific hat functions
    178  */
    179 void	hat_pagecachectl(struct page *, int);
    180 
    181 /* flags for hat_pagecachectl */
    182 #define	HAT_CACHE	0x1
    183 #define	HAT_UNCACHE	0x2
    184 #define	HAT_TMPNC	0x4
    185 
    186 /*
    187  * Flag to allow the creation of non-cacheable translations
    188  * to system memory. It is off by default. At the moment this
    189  * flag is used by the ecache error injector. The error injector
    190  * will turn it on when creating such a translation then shut it
    191  * off when it's finished.
    192  */
    193 
    194 int	sfmmu_allow_nc_trans = 0;
    195 
    196 /*
    197  * Flag to disable large page support.
    198  * 	value of 1 => disable all large pages.
    199  *	bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
    200  *
    201  * For example, use the value 0x4 to disable 512K pages.
    202  *
    203  */
    204 #define	LARGE_PAGES_OFF		0x1
    205 
    206 /*
    207  * The disable_large_pages and disable_ism_large_pages variables control
    208  * hat_memload_array and the page sizes to be used by ISM and the kernel.
    209  *
    210  * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
    211  * are only used to control which OOB pages to use at upper VM segment creation
    212  * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
    213  * Their values may come from platform or CPU specific code to disable page
    214  * sizes that should not be used.
    215  *
    216  * WARNING: 512K pages are currently not supported for ISM/DISM.
    217  */
    218 uint_t	disable_large_pages = 0;
    219 uint_t	disable_ism_large_pages = (1 << TTE512K);
    220 uint_t	disable_auto_data_large_pages = 0;
    221 uint_t	disable_auto_text_large_pages = 0;
    222 
    223 /*
    224  * Private sfmmu data structures for hat management
    225  */
    226 static struct kmem_cache *sfmmuid_cache;
    227 static struct kmem_cache *mmuctxdom_cache;
    228 
    229 /*
    230  * Private sfmmu data structures for tsb management
    231  */
    232 static struct kmem_cache *sfmmu_tsbinfo_cache;
    233 static struct kmem_cache *sfmmu_tsb8k_cache;
    234 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
    235 static vmem_t *kmem_bigtsb_arena;
    236 static vmem_t *kmem_tsb_arena;
    237 
    238 /*
    239  * sfmmu static variables for hmeblk resource management.
    240  */
    241 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
    242 static struct kmem_cache *sfmmu8_cache;
    243 static struct kmem_cache *sfmmu1_cache;
    244 static struct kmem_cache *pa_hment_cache;
    245 
    246 static kmutex_t 	ism_mlist_lock;	/* mutex for ism mapping list */
    247 /*
    248  * private data for ism
    249  */
    250 static struct kmem_cache *ism_blk_cache;
    251 static struct kmem_cache *ism_ment_cache;
    252 #define	ISMID_STARTADDR	NULL
    253 
    254 /*
    255  * Region management data structures and function declarations.
    256  */
    257 
    258 static void	sfmmu_leave_srd(sfmmu_t *);
    259 static int	sfmmu_srdcache_constructor(void *, void *, int);
    260 static void	sfmmu_srdcache_destructor(void *, void *);
    261 static int	sfmmu_rgncache_constructor(void *, void *, int);
    262 static void	sfmmu_rgncache_destructor(void *, void *);
    263 static int	sfrgnmap_isnull(sf_region_map_t *);
    264 static int	sfhmergnmap_isnull(sf_hmeregion_map_t *);
    265 static int	sfmmu_scdcache_constructor(void *, void *, int);
    266 static void	sfmmu_scdcache_destructor(void *, void *);
    267 static void	sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
    268     size_t, void *, u_offset_t);
    269 
    270 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
    271 static sf_srd_bucket_t *srd_buckets;
    272 static struct kmem_cache *srd_cache;
    273 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
    274 static struct kmem_cache *region_cache;
    275 static struct kmem_cache *scd_cache;
    276 
    277 #ifdef sun4v
    278 int use_bigtsb_arena = 1;
    279 #else
    280 int use_bigtsb_arena = 0;
    281 #endif
    282 
    283 /* External /etc/system tunable, for turning on and off the shctx support */
    284 int disable_shctx = 0;
    285 /* Internal variable, set by MD if the HW supports shctx feature */
    286 int shctx_on = 0;
    287 
    288 #ifdef DEBUG
    289 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
    290 #endif
    291 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
    292 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);
    293 
    294 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
    295 static void sfmmu_find_scd(sfmmu_t *);
    296 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
    297 static void sfmmu_finish_join_scd(sfmmu_t *);
    298 static void sfmmu_leave_scd(sfmmu_t *, uchar_t);
    299 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
    300 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
    301 static void sfmmu_free_scd_tsbs(sfmmu_t *);
    302 static void sfmmu_tsb_inv_ctx(sfmmu_t *);
    303 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
    304 static void sfmmu_ism_hatflags(sfmmu_t *, int);
    305 static int sfmmu_srd_lock_held(sf_srd_t *);
    306 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
    307 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
    308 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
    309 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
    310 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
    311 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);
    312 
    313 /*
    314  * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
    315  * HAT flags, synchronizing TLB/TSB coherency, and context management.
    316  * The lock is hashed on the sfmmup since the case where we need to lock
    317  * all processes is rare but does occur (e.g. we need to unload a shared
    318  * mapping from all processes using the mapping).  We have a lot of buckets,
    319  * and each slab of sfmmu_t's can use about a quarter of them, giving us
    320  * a fairly good distribution without wasting too much space and overhead
    321  * when we have to grab them all.
    322  */
    323 #define	SFMMU_NUM_LOCK	128		/* must be power of two */
    324 hatlock_t	hat_lock[SFMMU_NUM_LOCK];
    325 
    326 /*
    327  * Hash algorithm optimized for a small number of slabs.
    328  *  7 is (highbit((sizeof sfmmu_t)) - 1)
    329  * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
    330  * kmem_cache, and thus they will be sequential within that cache.  In
    331  * addition, each new slab will have a different "color" up to cache_maxcolor
    332  * which will skew the hashing for each successive slab which is allocated.
    333  * If the size of sfmmu_t changed to a larger size, this algorithm may need
    334  * to be revisited.
    335  */
    336 #define	TSB_HASH_SHIFT_BITS (7)
        /*
         * PTR_HASH drops the low TSB_HASH_SHIFT_BITS bits of a pointer value.
         * The argument is parenthesized so that an expression argument is cast
         * and shifted as a whole.
         */
    337 #define	PTR_HASH(x) ((uintptr_t)(x) >> TSB_HASH_SHIFT_BITS)
    338 
        /*
         * TSB_HASH yields the address of the hat_lock[] entry selected by the
         * pointer hash of sfmmup, masked to SFMMU_NUM_LOCK entries.  On DEBUG
         * kernels a non-zero tsb_hash_debug makes every sfmmu select hat_lock[0].
         * Both variants expand to a fully parenthesized expression.
         */
    339 #ifdef DEBUG
    340 int tsb_hash_debug = 0;
    341 #define	TSB_HASH(sfmmup)	\
    342 	(tsb_hash_debug ? &hat_lock[0] : \
    343 	&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
    344 #else	/* DEBUG */
    345 #define	TSB_HASH(sfmmup) (&hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
    346 #endif	/* DEBUG */
    347 
    348 
    349 /* sfmmu_replace_tsb() return codes. */
    350 typedef enum tsb_replace_rc {
        	/*
        	 * No explicit values are given, so the enumerators number 0 through 4
        	 * in the order listed.
        	 * NOTE(review): the per-code notes below are read off the names only;
        	 * confirm each against sfmmu_replace_tsb().
        	 */
    351 	TSB_SUCCESS,		/* presumably: replacement done */
    352 	TSB_ALLOCFAIL,		/* presumably: new TSB allocation failed */
    353 	TSB_LOSTRACE,		/* presumably: lost a race with another thread */
    354 	TSB_ALREADY_SWAPPED,	/* presumably: TSB found already swapped */
    355 	TSB_CANTGROW		/* presumably: growing the TSB not possible */
    356 } tsb_replace_rc_t;
    357 
    358 /*
    359  * Flags for TSB allocation routines.
    360  */
    361 #define	TSB_ALLOC	0x01
    362 #define	TSB_FORCEALLOC	0x02
    363 #define	TSB_GROW	0x04
    364 #define	TSB_SHRINK	0x08
    365 #define	TSB_SWAPIN	0x10
    366 
    367 /*
    368  * Support for HAT callbacks.
    369  */
    370 #define	SFMMU_MAX_RELOC_CALLBACKS	10
    371 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
    372 static id_t sfmmu_cb_nextid = 0;
    373 static id_t sfmmu_tsb_cb_id;
    374 struct sfmmu_callback *sfmmu_cb_table;
    375 
    376 /*
    377  * Kernel page relocation is enabled by default for non-caged
    378  * kernel pages.  This has little effect unless segkmem_reloc is
    379  * set, since by default kernel memory comes from inside the
    380  * kernel cage.
    381  */
    382 int hat_kpr_enabled = 1;
    383 
    384 kmutex_t	kpr_mutex;
    385 kmutex_t	kpr_suspendlock;
    386 kthread_t	*kreloc_thread;
    387 
    388 /*
    389  * Enable VA->PA translation sanity checking on DEBUG kernels.
    390  * Disabled by default.  This is incompatible with some
    391  * drivers (error injector, RSM) so if it breaks you get
    392  * to keep both pieces.
    393  */
    394 int hat_check_vtop = 0;
    395 
    396 /*
    397  * Private sfmmu routines (prototypes)
    398  */
    399 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t);
    400 static struct 	hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t,
    401 			struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t,
    402 			uint_t);
    403 static caddr_t	sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t,
    404 			caddr_t, demap_range_t *, uint_t);
    405 static caddr_t	sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t,
    406 			caddr_t, int);
    407 static void	sfmmu_hblk_free(struct hme_blk **);
    408 static void	sfmmu_hblks_list_purge(struct hme_blk **, int);
    409 static uint_t	sfmmu_get_free_hblk(struct hme_blk **, uint_t);
    410 static uint_t	sfmmu_put_free_hblk(struct hme_blk *, uint_t);
    411 static struct hme_blk *sfmmu_hblk_steal(int);
    412 static int	sfmmu_steal_this_hblk(struct hmehash_bucket *,
    413 			struct hme_blk *, uint64_t, struct hme_blk *);
    414 static caddr_t	sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t);
    415 
    416 static void	hat_do_memload_array(struct hat *, caddr_t, size_t,
    417 		    struct page **, uint_t, uint_t, uint_t);
    418 static void	hat_do_memload(struct hat *, caddr_t, struct page *,
    419 		    uint_t, uint_t, uint_t);
    420 static void	sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **,
    421 		    uint_t, uint_t, pgcnt_t, uint_t);
    422 void		sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *,
    423 			uint_t);
    424 static int	sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **,
    425 			uint_t, uint_t);
    426 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *,
    427 					caddr_t, int, uint_t);
    428 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *,
    429 			struct hmehash_bucket *, caddr_t, uint_t, uint_t,
    430 			uint_t);
    431 static int	sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *,
    432 			caddr_t, page_t **, uint_t, uint_t);
    433 static void	sfmmu_tteload_release_hashbucket(struct hmehash_bucket *);
    434 
    435 static int	sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int);
    436 static pfn_t	sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *);
    437 void		sfmmu_memtte(tte_t *, pfn_t, uint_t, int);
    438 #ifdef VAC
    439 static void	sfmmu_vac_conflict(struct hat *, caddr_t, page_t *);
    440 static int	sfmmu_vacconflict_array(caddr_t, page_t *, int *);
    441 int	tst_tnc(page_t *pp, pgcnt_t);
    442 void	conv_tnc(page_t *pp, int);
    443 #endif
    444 
    445 static void	sfmmu_get_ctx(sfmmu_t *);
    446 static void	sfmmu_free_sfmmu(sfmmu_t *);
    447 
    448 static void	sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *);
    449 static void	sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int);
    450 
    451 cpuset_t	sfmmu_pageunload(page_t *, struct sf_hment *, int);
    452 static void	hat_pagereload(struct page *, struct page *);
    453 static cpuset_t	sfmmu_pagesync(page_t *, struct sf_hment *, uint_t);
    454 #ifdef VAC
    455 void	sfmmu_page_cache_array(page_t *, int, int, pgcnt_t);
    456 static void	sfmmu_page_cache(page_t *, int, int, int);
    457 #endif
    458 
    459 cpuset_t	sfmmu_rgntlb_demap(caddr_t, sf_region_t *,
    460     struct hme_blk *, int);
    461 static void	sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
    462 			pfn_t, int, int, int, int);
    463 static void	sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
    464 			pfn_t, int);
    465 static void	sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
    466 static void	sfmmu_tlb_range_demap(demap_range_t *);
    467 static void	sfmmu_invalidate_ctx(sfmmu_t *);
    468 static void	sfmmu_sync_mmustate(sfmmu_t *);
    469 
    470 static void 	sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
    471 static int	sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t,
    472 			sfmmu_t *);
    473 static void	sfmmu_tsb_free(struct tsb_info *);
    474 static void	sfmmu_tsbinfo_free(struct tsb_info *);
    475 static int	sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t,
    476 			sfmmu_t *);
    477 static void	sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *);
    478 static void	sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *);
    479 static int	sfmmu_select_tsb_szc(pgcnt_t);
    480 static void	sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int);
    481 #define		sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \
    482 	sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
    483 #define		sfmmu_unload_tsb(sfmmup, vaddr, szc)    \
    484 	sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
    485 static void	sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *);
    486 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t,
    487     hatlock_t *, uint_t);
    488 static void	sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int);
    489 
    490 #ifdef VAC
    491 void	sfmmu_cache_flush(pfn_t, int);
    492 void	sfmmu_cache_flushcolor(int, pfn_t);
    493 #endif
    494 static caddr_t	sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t,
    495 			caddr_t, demap_range_t *, uint_t, int);
    496 
    497 static uint64_t	sfmmu_vtop_attr(uint_t, int mode, tte_t *);
    498 static uint_t	sfmmu_ptov_attr(tte_t *);
    499 static caddr_t	sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t,
    500 			caddr_t, demap_range_t *, uint_t);
    501 static uint_t	sfmmu_vtop_prot(uint_t, uint_t *);
    502 static int	sfmmu_idcache_constructor(void *, void *, int);
    503 static void	sfmmu_idcache_destructor(void *, void *);
    504 static int	sfmmu_hblkcache_constructor(void *, void *, int);
    505 static void	sfmmu_hblkcache_destructor(void *, void *);
    506 static void	sfmmu_hblkcache_reclaim(void *);
    507 static void	sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *,
    508 			struct hmehash_bucket *);
    509 static void	sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *,
    510 			struct hme_blk *, struct hme_blk **, int);
    511 static void	sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *,
    512 			uint64_t);
    513 static struct hme_blk *sfmmu_check_pending_hblks(int);
    514 static void	sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int);
    515 static void	sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int);
    516 static void	sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t,
    517 			int, caddr_t *);
    518 static void	sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *);
    519 
    520 static void	sfmmu_rm_large_mappings(page_t *, int);
    521 
    522 static void	hat_lock_init(void);
    523 static void	hat_kstat_init(void);
    524 static int	sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
    525 static void	sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
    526 static	int	sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
    527 static void	sfmmu_check_page_sizes(sfmmu_t *, int);
    528 int	fnd_mapping_sz(page_t *);
    529 static void	iment_add(struct ism_ment *,  struct hat *);
    530 static void	iment_sub(struct ism_ment *, struct hat *);
    531 static pgcnt_t	ism_tsb_entries(sfmmu_t *, int szc);
    532 extern void	sfmmu_setup_tsbinfo(sfmmu_t *);
    533 extern void	sfmmu_clear_utsbinfo(void);
    534 
    535 static void	sfmmu_ctx_wrap_around(mmu_ctx_t *);
    536 
    537 extern int vpm_enable;
    538 
    539 /* kpm globals */
    540 #ifdef	DEBUG
    541 /*
    542  * Enable trap level tsbmiss handling
    543  */
    544 int	kpm_tsbmtl = 1;
    545 
    546 /*
    547  * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
    548  * required TLB shootdowns in this case, so handle w/ care. Off by default.
    549  */
    550 int	kpm_tlb_flush;
    551 #endif	/* DEBUG */
    552 
    553 static void	*sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
    554 
    555 #ifdef DEBUG
        /*
         * Declared with an explicit (void) so that this is a true prototype; an
         * empty parameter list would leave the arguments unchecked.
         */
    556 static void	sfmmu_check_hblk_flist(void);
    557 #endif
    558 
    559 /*
    560  * Semi-private sfmmu data structures.  Some of them are initialized in
    561  * startup or in hat_init. Some of them are private but accessed by
    562  * assembly code or mach_sfmmu.c
    563  */
    564 struct hmehash_bucket *uhme_hash;	/* user hmeblk hash table */
    565 struct hmehash_bucket *khme_hash;	/* kernel hmeblk hash table */
    566 uint64_t	uhme_hash_pa;		/* PA of uhme_hash */
    567 uint64_t	khme_hash_pa;		/* PA of khme_hash */
    568 int 		uhmehash_num;		/* # of buckets in user hash table */
    569 int 		khmehash_num;		/* # of buckets in kernel hash table */
    570 
    571 uint_t		max_mmu_ctxdoms = 0;	/* max context domains in the system */
    572 mmu_ctx_t	**mmu_ctxs_tbl;		/* global array of context domains */
    573 uint64_t	mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */
    574 
    575 #define	DEFAULT_NUM_CTXS_PER_MMU 8192
    576 static uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;
    577 
    578 int		cache;			/* describes system cache */
    579 
    580 caddr_t		ktsb_base;		/* kernel 8k-indexed tsb base address */
    581 uint64_t	ktsb_pbase;		/* kernel 8k-indexed tsb phys address */
    582 int		ktsb_szcode;		/* kernel 8k-indexed tsb size code */
    583 int		ktsb_sz;		/* kernel 8k-indexed tsb size */
    584 
    585 caddr_t		ktsb4m_base;		/* kernel 4m-indexed tsb base address */
    586 uint64_t	ktsb4m_pbase;		/* kernel 4m-indexed tsb phys address */
    587 int		ktsb4m_szcode;		/* kernel 4m-indexed tsb size code */
    588 int		ktsb4m_sz;		/* kernel 4m-indexed tsb size */
    589 
    590 uint64_t	kpm_tsbbase;		/* kernel seg_kpm 4M TSB base address */
    591 int		kpm_tsbsz;		/* kernel seg_kpm 4M TSB size code */
    592 uint64_t	kpmsm_tsbbase;		/* kernel seg_kpm 8K TSB base address */
    593 int		kpmsm_tsbsz;		/* kernel seg_kpm 8K TSB size code */
    594 
    595 #ifndef sun4v
    596 int		utsb_dtlb_ttenum = -1;	/* index in TLB for utsb locked TTE */
    597 int		utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
    598 int		dtlb_resv_ttenum;	/* index in TLB of first reserved TTE */
    599 caddr_t		utsb_vabase;		/* reserved kernel virtual memory */
    600 caddr_t		utsb4m_vabase;		/* for trap handler TSB accesses */
    601 #endif /* sun4v */
    602 uint64_t	tsb_alloc_bytes = 0;	/* bytes allocated to TSBs */
    603 vmem_t		*kmem_tsb_default_arena[NLGRPS_MAX];	/* For dynamic TSBs */
    604 vmem_t		*kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */
    605 
    606 /*
    607  * Size to use for TSB slabs.  Future platforms that support page sizes
    608  * larger than 4M may wish to change these values, and provide their own
    609  * assembly macros for building and decoding the TSB base register contents.
    610  * Note disable_large_pages will override the value set here.
    611  */
    612 static	uint_t tsb_slab_ttesz = TTE4M;
    613 size_t	tsb_slab_size = MMU_PAGESIZE4M;
    614 uint_t	tsb_slab_shift = MMU_PAGESHIFT4M;
    615 /* PFN mask for TTE */
    616 size_t	tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;
    617 
    618 /*
    619  * Size to use for TSB slabs.  These are used only when 256M tsb arenas
    620  * exist.
    621  */
    622 static uint_t	bigtsb_slab_ttesz = TTE256M;
    623 static size_t	bigtsb_slab_size = MMU_PAGESIZE256M;
    624 static uint_t	bigtsb_slab_shift = MMU_PAGESHIFT256M;
    625 /* 256M page alignment for 8K pfn */
    626 static size_t	bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;
    627 
    628 /* largest TSB size to grow to, will be smaller on smaller memory systems */
    629 static int	tsb_max_growsize = 0;
    630 
    631 /*
    632  * Tunable parameters dealing with TSB policies.
    633  */
    634 
    635 /*
    636  * This undocumented tunable forces all 8K TSBs to be allocated from
    637  * the kernel heap rather than from the kmem_tsb_default_arena arenas.
    638  */
    639 #ifdef	DEBUG
    640 int	tsb_forceheap = 0;
    641 #endif	/* DEBUG */
    642 
    643 /*
    644  * Decide whether to use per-lgroup arenas, or one global set of
    645  * TSB arenas.  The default is not to break up per-lgroup, since
    646  * most platforms don't recognize any tangible benefit from it.
    647  */
    648 int	tsb_lgrp_affinity = 0;
    649 
    650 /*
    651  * Used for growing the TSB based on the process RSS.
    652  * tsb_rss_factor is based on the smallest TSB, and is
    653  * shifted by the TSB size to determine if we need to grow.
    654  * The default will grow the TSB if the number of TTEs for
    655  * this page size exceeds 75% of the number of TSB entries,
    656  * which should _almost_ eliminate all conflict misses
    657  * (at the expense of using up lots and lots of memory).
    658  */
    659 #define	TSB_RSS_FACTOR		(TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
        /*
         * tsb_rss_factor scaled up by a TSB size code.  The argument is
         * parenthesized so an expression argument is used as the whole shift count.
         */
    660 #define	SFMMU_RSS_TSBSIZE(tsbszc)	(tsb_rss_factor << (tsbszc))
        /*
         * Size code from sfmmu_select_tsb_szc(pgcnt) when enable_tsb_rss_sizing
         * is non-zero, otherwise default_tsb_size.
         */
    661 #define	SELECT_TSB_SIZECODE(pgcnt) ( \
    662 	(enable_tsb_rss_sizing) ? sfmmu_select_tsb_szc(pgcnt) : \
    663 	default_tsb_size)
        /* True when TSB bytes exceed tsb_alloc_hiwater or freemem is below desfree. */
    664 #define	TSB_OK_SHRINK()	\
    665 	(tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
        /* True when TSB bytes are under tsb_alloc_hiwater and freemem is above desfree. */
    666 #define	TSB_OK_GROW()	\
    667 	(tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)
    668 
    669 int	enable_tsb_rss_sizing = 1;
    670 int	tsb_rss_factor	= (int)TSB_RSS_FACTOR;
    671 
    672 /* which TSB size code to use for new address spaces or if rss sizing off */
    673 int default_tsb_size = TSB_8K_SZCODE;
    674 
    675 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
    676 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
    677 #define	TSB_ALLOC_HIWATER_FACTOR_DEFAULT	32
    678 
    679 #ifdef DEBUG
    680 static int tsb_random_size = 0;	/* set to 1 to test random tsb sizes on alloc */
    681 static int tsb_grow_stress = 0;	/* if set to 1, keep replacing TSB w/ random */
    682 static int tsb_alloc_mtbf = 0;	/* fail allocation every n attempts */
    683 static int tsb_alloc_fail_mtbf = 0;
    684 static int tsb_alloc_count = 0;
    685 #endif /* DEBUG */
    686 
    687 /* if set to 1, will remap valid TTEs when growing TSB. */
    688 int tsb_remap_ttes = 1;
    689 
    690 /*
    691  * If we have more than this many mappings, allocate a second TSB.
    692  * This default is chosen because the I/D fully associative TLBs are
    693  * assumed to have at least 8 available entries. Platforms with a
    694  * larger fully-associative TLB could probably override the default.
    695  */
    696 
    697 #ifdef sun4v
    698 int tsb_sectsb_threshold = 0;
    699 #else
    700 int tsb_sectsb_threshold = 8;
    701 #endif
    702 
    703 /*
    704  * kstat data
    705  */
    706 struct sfmmu_global_stat sfmmu_global_stat;
    707 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;
    708 
    709 /*
    710  * Global data
    711  */
    712 sfmmu_t 	*ksfmmup;		/* kernel's hat id */
    713 
    714 #ifdef DEBUG
    715 static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
    716 #endif
    717 
    718 /* sfmmu locking operations */
    719 static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
    720 static int	sfmmu_mlspl_held(struct page *, int);
    721 
    722 kmutex_t *sfmmu_page_enter(page_t *);
    723 void	sfmmu_page_exit(kmutex_t *);
    724 int	sfmmu_page_spl_held(struct page *);
    725 
    726 /* sfmmu internal locking operations - accessed directly */
    727 static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
    728 				kmutex_t **, kmutex_t **);
    729 static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
    730 static hatlock_t *
    731 		sfmmu_hat_enter(sfmmu_t *);
    732 static hatlock_t *
    733 		sfmmu_hat_tryenter(sfmmu_t *);
    734 static void	sfmmu_hat_exit(hatlock_t *);
    735 static void	sfmmu_hat_lock_all(void);
    736 static void	sfmmu_hat_unlock_all(void);
    737 static void	sfmmu_ismhat_enter(sfmmu_t *, int);
    738 static void	sfmmu_ismhat_exit(sfmmu_t *, int);
    739 
    740 /*
    741  * Array of mutexes protecting a page's mapping list and p_nrm field.
    742  *
    743  * The hash function looks complicated, but is made up so that:
    744  *
    745  * "pp" not shifted, so adjacent pp values will hash to different cache lines
    746  *  (8 byte alignment * 8 bytes/mutes == 64 byte coherency subblock)
    747  *
    748  * "pp" >> mml_shift, incorporates more source bits into the hash result
    749  *
    750  *  "& (mml_table_size - 1), should be faster than using remainder "%"
    751  *
    752  * Hopefully, mml_table, mml_table_size and mml_shift are all in the same
    753  * cacheline, since they get declared next to each other below. We'll trust
    754  * ld not to do something random.
    755  */
    756 #ifdef	DEBUG
    757 int mlist_hash_debug = 0;
    758 #define	MLIST_HASH(pp)	(mlist_hash_debug ? &mml_table[0] : \
    759 	&mml_table[((uintptr_t)(pp) + \
    760 	((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)])
    761 #else	/* !DEBUG */
    762 #define	MLIST_HASH(pp)   &mml_table[ \
    763 	((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]
    764 #endif	/* !DEBUG */
    765 
/*
 * Page mapping-list mutex table indexed by MLIST_HASH(); the entries are
 * initialized in hat_lock_init().
 */
kmutex_t		*mml_table;
uint_t			mml_table_sz;	/* must be a power of 2 */
uint_t			mml_shift;	/* log2(mml_table_sz) + 3 for align */

/*
 * kpm lock tables.  The khl_mutex of each kpmp_table entry is initialized
 * in hat_lock_init() when kpm_enable is set.
 */
kpm_hlk_t	*kpmp_table;
uint_t		kpmp_table_sz;	/* must be a power of 2 */
uchar_t		kpmp_shift;

kpm_shlk_t	*kpmp_stable;
uint_t		kpmp_stable_sz;	/* must be a power of 2 */
    776 
    777 /*
    778  * SPL_HASH was improved to avoid false cache line sharing
    779  */
    780 #define	SPL_TABLE_SIZE	128
    781 #define	SPL_MASK	(SPL_TABLE_SIZE - 1)
    782 #define	SPL_SHIFT	7		/* log2(SPL_TABLE_SIZE) */
    783 
    784 #define	SPL_INDEX(pp) \
    785 	((((uintptr_t)(pp) >> SPL_SHIFT) ^ \
    786 	((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \
    787 	(SPL_TABLE_SIZE - 1))
    788 
    789 #define	SPL_HASH(pp)    \
    790 	(&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex)
    791 
/* Lock table indexed by SPL_INDEX(); SPL_HASH() returns &entry.pad_mutex. */
static	pad_mutex_t	sfmmu_page_lock[SPL_TABLE_SIZE];
    793 
    794 
/*
 * hat_unload_callback() will group together callbacks in order
 * to avoid xt_sync() calls.  This is the maximum size of the group.
 */
#define	MAX_CB_ADDR	32

/* Filled in by MAKE_TTE_MASK() in hat_init() (hardware-only TTE bits). */
tte_t	hw_tte;
static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;

/*
 * Names of the per-MMU-context-domain named kstats.  The array is indexed
 * by stat number in sfmmu_mmu_kstat_create(), so the order here must match
 * the order of the stat numbers below MMU_CTX_NUM_STATS.
 */
static char	*mmu_ctx_kstat_names[] = {
	"mmu_ctx_tsb_exceptions",
	"mmu_ctx_tsb_raise_exception",
	"mmu_ctx_wrap_around",
};
    809 
    810 /*
    811  * Wrapper for vmem_xalloc since vmem_create only allows limited
    812  * parameters for vm_source_alloc functions.  This function allows us
    813  * to specify alignment consistent with the size of the object being
    814  * allocated.
    815  */
    816 static void *
    817 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
    818 {
    819 	return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
    820 }
    821 
    822 /* Common code for setting tsb_alloc_hiwater. */
    823 #define	SFMMU_SET_TSB_ALLOC_HIWATER(pages)	tsb_alloc_hiwater = \
    824 		ptob(pages) / tsb_alloc_hiwater_factor
    825 
    826 /*
    827  * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
    828  * a single TSB.  physmem is the number of physical pages so we need physmem 8K
    829  * TTEs to represent all those physical pages.  We round this up by using
    830  * 1<<highbit().  To figure out which size code to use, remember that the size
    831  * code is just an amount to shift the smallest TSB size to get the size of
    832  * this TSB.  So we subtract that size, TSB_START_SIZE, from highbit() (or
    833  * highbit() - 1) to get the size code for the smallest TSB that can represent
    834  * all of physical memory, while erring on the side of too much.
    835  *
    836  * Restrict tsb_max_growsize to make sure that:
    837  *	1) TSBs can't grow larger than the TSB slab size
    838  *	2) TSBs can't grow larger than UTSB_MAX_SZCODE.
    839  */
    840 #define	SFMMU_SET_TSB_MAX_GROWSIZE(pages) {				\
    841 	int	_i, _szc, _slabszc, _tsbszc;				\
    842 									\
    843 	_i = highbit(pages);						\
    844 	if ((1 << (_i - 1)) == (pages))					\
    845 		_i--;		/* 2^n case, round down */              \
    846 	_szc = _i - TSB_START_SIZE;					\
    847 	_slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
    848 	_tsbszc = MIN(_szc, _slabszc);                                  \
    849 	tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE);               \
    850 }
    851 
    852 /*
    853  * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
    854  * tsb_info which handles that TTE size.
    855  */
    856 #define	SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) {			\
    857 	(tsbinfop) = (sfmmup)->sfmmu_tsb;				\
    858 	ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) ||		\
    859 	    sfmmu_hat_lock_held(sfmmup));				\
    860 	if ((tte_szc) >= TTE4M)	{					\
    861 		ASSERT((tsbinfop) != NULL);				\
    862 		(tsbinfop) = (tsbinfop)->tsb_next;			\
    863 	}								\
    864 }
    865 
    866 /*
    867  * Macro to use to unload entries from the TSB.
    868  * It has knowledge of which page sizes get replicated in the TSB
    869  * and will call the appropriate unload routine for the appropriate size.
    870  */
    871 #define	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat)		\
    872 {									\
    873 	int ttesz = get_hblk_ttesz(hmeblkp);				\
    874 	if (ttesz == TTE8K || ttesz == TTE4M) {				\
    875 		sfmmu_unload_tsb(sfmmup, addr, ttesz);			\
    876 	} else {							\
    877 		caddr_t sva = ismhat ? addr : 				\
    878 		    (caddr_t)get_hblk_base(hmeblkp);			\
    879 		caddr_t eva = sva + get_hblk_span(hmeblkp);		\
    880 		ASSERT(addr >= sva && addr < eva);			\
    881 		sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz);	\
    882 	}								\
    883 }
    884 
    885 
/*
 * Update tsb_alloc_hiwater and tsb_max_growsize after memory is configured.
 * Installed as the post_add entry of sfmmu_update_vec; neither argument is
 * used, the new limits are derived from the global physmem.
 */
/*ARGSUSED*/
static void
sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
{
	/* Assumes physmem has already been updated. */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}
    895 
/*
 * Update tsb_alloc_hiwater before memory is deleted.  We'll do nothing here
 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
 * deleted.
 *
 * Installed as the pre_del entry of sfmmu_update_vec; always returns 0.
 */
/*ARGSUSED*/
static int
sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
{
	return (0);
}
    907 
/*
 * Update tsb_alloc_hiwater and tsb_max_growsize once a memory delete has
 * finished, regardless of whether it completed or was cancelled.
 * Installed as the post_del entry of sfmmu_update_vec; none of the
 * arguments are used, the new limits are derived from the global physmem.
 */
/*ARGSUSED*/
static void
sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/*
	 * Whether the delete was cancelled or not, just go ahead and update
	 * tsb_alloc_hiwater and tsb_max_growsize.
	 */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}
    920 
/*
 * Memory add/delete callback vector; registered by hat_init() through
 * kphysm_setup_func_register().
 */
static kphysm_setup_vector_t sfmmu_update_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
	sfmmu_update_post_add,		/* post_add */
	sfmmu_update_pre_del,		/* pre_del */
	sfmmu_update_post_del		/* post_del */
};
    927 
    928 
    929 /*
    930  * HME_BLK HASH PRIMITIVES
    931  */
    932 
    933 /*
    934  * Enter a hme on the mapping list for page pp.
    935  * When large pages are more prevalent in the system we might want to
    936  * keep the mapping list in ascending order by the hment size. For now,
    937  * small pages are more frequent, so don't slow it down.
    938  */
    939 #define	HME_ADD(hme, pp)					\
    940 {								\
    941 	ASSERT(sfmmu_mlist_held(pp));				\
    942 								\
    943 	hme->hme_prev = NULL;					\
    944 	hme->hme_next = pp->p_mapping;				\
    945 	hme->hme_page = pp;					\
    946 	if (pp->p_mapping) {					\
    947 		((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
    948 		ASSERT(pp->p_share > 0);			\
    949 	} else  {						\
    950 		/* EMPTY */					\
    951 		ASSERT(pp->p_share == 0);			\
    952 	}							\
    953 	pp->p_mapping = hme;					\
    954 	pp->p_share++;						\
    955 }
    956 
/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
 *
 * The caller must hold the page's mapping list lock.  Panics if the page
 * has no mappings.  The sequence is: issue membar_stst(), drop p_share,
 * unlink hme from its neighbours (or from the list head), clear hme's link
 * and page fields, and finally, for hments larger than TTE8K, call
 * sfmmu_rm_large_mappings().  Note that hme_size(hme) is evaluated after
 * the link fields have been cleared.
 */
#define	HME_SUB(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
	ASSERT(hme->hme_page == pp || IS_PAHME(hme));		\
								\
	if (pp->p_mapping == NULL) {				\
		panic("hme_remove - no mappings");		\
	}							\
								\
	membar_stst();	/* ensure previous stores finish */	\
								\
	ASSERT(pp->p_share > 0);				\
	pp->p_share--;						\
								\
	if (hme->hme_prev) {					\
		ASSERT(pp->p_mapping != hme);			\
		ASSERT(hme->hme_prev->hme_page == pp ||		\
			IS_PAHME(hme->hme_prev));		\
		hme->hme_prev->hme_next = hme->hme_next;	\
	} else {						\
		ASSERT(pp->p_mapping == hme);			\
		pp->p_mapping = hme->hme_next;			\
		ASSERT((pp->p_mapping == NULL) ?		\
			(pp->p_share == 0) : 1);		\
	}							\
								\
	if (hme->hme_next) {					\
		ASSERT(hme->hme_next->hme_page == pp ||		\
			IS_PAHME(hme->hme_next));		\
		hme->hme_next->hme_prev = hme->hme_prev;	\
	}							\
								\
	/* zero out the entry */				\
	hme->hme_next = NULL;					\
	hme->hme_prev = NULL;					\
	hme->hme_page = NULL;					\
								\
	if (hme_size(hme) > TTE8K) {				\
		/* remove mappings for remainder of large pg */	\
		sfmmu_rm_large_mappings(pp, hme_size(hme));	\
	}							\
}
   1004 
   1005 /*
   1006  * This function returns the hment given the hme_blk and a vaddr.
   1007  * It assumes addr has already been checked to belong to hme_blk's
   1008  * range.
   1009  */
   1010 #define	HBLKTOHME(hment, hmeblkp, addr)					\
   1011 {									\
   1012 	int index;							\
   1013 	HBLKTOHME_IDX(hment, hmeblkp, addr, index)			\
   1014 }
   1015 
   1016 /*
   1017  * Version of HBLKTOHME that also returns the index in hmeblkp
   1018  * of the hment.
   1019  */
   1020 #define	HBLKTOHME_IDX(hment, hmeblkp, addr, idx)			\
   1021 {									\
   1022 	ASSERT(in_hblk_range((hmeblkp), (addr)));			\
   1023 									\
   1024 	if (get_hblk_ttesz(hmeblkp) == TTE8K) {				\
   1025 		idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
   1026 	} else								\
   1027 		idx = 0;						\
   1028 									\
   1029 	(hment) = &(hmeblkp)->hblk_hme[idx];				\
   1030 }
   1031 
   1032 /*
   1033  * Disable any page sizes not supported by the CPU
   1034  */
   1035 void
   1036 hat_init_pagesizes()
   1037 {
   1038 	int 		i;
   1039 
   1040 	mmu_exported_page_sizes = 0;
   1041 	for (i = TTE8K; i < max_mmu_page_sizes; i++) {
   1042 
   1043 		szc_2_userszc[i] = (uint_t)-1;
   1044 		userszc_2_szc[i] = (uint_t)-1;
   1045 
   1046 		if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
   1047 			disable_large_pages |= (1 << i);
   1048 		} else {
   1049 			szc_2_userszc[i] = mmu_exported_page_sizes;
   1050 			userszc_2_szc[mmu_exported_page_sizes] = i;
   1051 			mmu_exported_page_sizes++;
   1052 		}
   1053 	}
   1054 
   1055 	disable_ism_large_pages |= disable_large_pages;
   1056 	disable_auto_data_large_pages = disable_large_pages;
   1057 	disable_auto_text_large_pages = disable_large_pages;
   1058 
   1059 	/*
   1060 	 * Initialize mmu-specific large page sizes.
   1061 	 */
   1062 	if (&mmu_large_pages_disabled) {
   1063 		disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
   1064 		disable_ism_large_pages |=
   1065 		    mmu_large_pages_disabled(HAT_LOAD_SHARE);
   1066 		disable_auto_data_large_pages |=
   1067 		    mmu_large_pages_disabled(HAT_AUTO_DATA);
   1068 		disable_auto_text_large_pages |=
   1069 		    mmu_large_pages_disabled(HAT_AUTO_TEXT);
   1070 	}
   1071 }
   1072 
/*
 * Initialize the hardware address translation structures.
 *
 * In order, this routine:
 *	- initializes the hat locks and kstats, the hardware TTE mask and
 *	  the supported page sizes;
 *	- initializes the kernel and user hmeblk hash bucket locks;
 *	- allocates the MMU context domain table and sets up the boot CPU;
 *	- creates the sfmmu, tsb_info and 8K TSB kmem caches;
 *	- settles the TSB slab geometry, tsb_alloc_hiwater and
 *	  tsb_max_growsize, and registers the memory add/delete callbacks;
 *	- creates the TSB vmem arenas and per-lgroup (or default) caches;
 *	- creates the hmeblk, pa_hment and ISM kmem caches;
 *	- allocates the kernel hat and initializes hblk_reserve;
 *	- initializes the relocation locks, optional shared context
 *	  support, the refmod hash table and the per-cpu pending hmeblk
 *	  freelists.
 *
 * All allocations are made with KM_SLEEP/VM_SLEEP.
 */
void
hat_init(void)
{
	int 		i;
	uint_t		sz;
	size_t		size;

	hat_lock_init();
	hat_kstat_init();

	/*
	 * Hardware-only bits in a TTE
	 */
	MAKE_TTE_MASK(&hw_tte);

	hat_init_pagesizes();

	/* Initialize the hash locks */
	for (i = 0; i < khmehash_num; i++) {
		mutex_init(&khme_hash[i].hmehash_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
	}
	for (i = 0; i < uhmehash_num; i++) {
		mutex_init(&uhme_hash[i].hmehash_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
	}
	khmehash_num--;		/* make sure counter starts from 0 */
	uhmehash_num--;		/* make sure counter starts from 0 */

	/*
	 * Allocate context domain structures.
	 *
	 * A platform may choose to modify max_mmu_ctxdoms in
	 * set_platform_defaults(). If a platform does not define
	 * a set_platform_defaults() or does not choose to modify
	 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
	 *
	 * For sun4v, there will be one global context domain, this is to
	 * avoid the ldom cpu substitution problem.
	 *
	 * For all platforms that have CPUs sharing MMUs, this
	 * value must be defined.
	 */
	if (max_mmu_ctxdoms == 0) {
#ifndef sun4v
		max_mmu_ctxdoms = max_ncpus;
#else /* sun4v */
		max_mmu_ctxdoms = 1;
#endif /* sun4v */
	}

	size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
	mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);

	/* mmu_ctx_t is 64 bytes aligned */
	mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
	    sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
	/*
	 * MMU context domain initialization for the Boot CPU.
	 * This needs the context domains array allocated above.
	 */
	mutex_enter(&cpu_lock);
	sfmmu_cpu_init(CPU);
	mutex_exit(&cpu_lock);

	/*
	 * Initialize ism mapping list lock.
	 */

	mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Each sfmmu structure carries an array of MMU context info
	 * structures, one per context domain. The size of this array depends
	 * on the maximum number of context domains. So, the size of the
	 * sfmmu structure varies per platform.
	 *
	 * sfmmu is allocated from static arena, because trap
	 * handler at TL > 0 is not allowed to touch kernel relocatable
	 * memory. sfmmu's alignment is changed to 64 bytes from
	 * default 8 bytes, as the lower 6 bits will be used to pass
	 * pgcnt to vtag_flush_pgcnt_tl1.
	 */
	size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1);

	sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size,
	    64, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
	    NULL, NULL, static_arena, 0);

	sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache",
	    sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Since we only use the tsb8k cache to "borrow" pages for TSBs
	 * from the heap when low on memory or when TSB_FORCEALLOC is
	 * specified, don't use magazines to cache them--we want to return
	 * them to the system as quickly as possible.
	 */
	sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache",
	    MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL,
	    static_arena, KMC_NOMAGAZINE);

	/*
	 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
	 * memory, which corresponds to the old static reserve for TSBs.
	 * tsb_alloc_hiwater_factor defaults to 32.  This caps the amount of
	 * memory we'll allocate for TSB slabs; beyond this point TSB
	 * allocations will be taken from the kernel heap (via
	 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
	 * consumer.
	 */
	if (tsb_alloc_hiwater_factor == 0) {
		tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT;
	}
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);

	/*
	 * Find the largest page size, at or below the configured TSB slab
	 * size, that has not been disabled.
	 */
	for (sz = tsb_slab_ttesz; sz > 0; sz--) {
		if (!(disable_large_pages & (1 << sz)))
			break;
	}

	if (sz < tsb_slab_ttesz) {
		tsb_slab_ttesz = sz;
		tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz;
		tsb_slab_size = 1 << tsb_slab_shift;
		tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1;
		use_bigtsb_arena = 0;
	} else if (use_bigtsb_arena &&
	    (disable_large_pages & (1 << bigtsb_slab_ttesz))) {
		use_bigtsb_arena = 0;
	}

	/*
	 * SFMMU_SET_TSB_MAX_GROWSIZE() reads bigtsb_slab_shift, so it must
	 * hold its final value before the macro is invoked.
	 */
	if (!use_bigtsb_arena) {
		bigtsb_slab_shift = tsb_slab_shift;
	}
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);

	/*
	 * On smaller memory systems, allocate TSB memory in smaller chunks
	 * than the default 4M slab size. We also honor disable_large_pages
	 * here.
	 *
	 * The trap handlers need to be patched with the final slab shift,
	 * since they need to be able to construct the TSB pointer at runtime.
	 */
	if ((tsb_max_growsize <= TSB_512K_SZCODE) &&
	    !(disable_large_pages & (1 << TTE512K))) {
		tsb_slab_ttesz = TTE512K;
		tsb_slab_shift = MMU_PAGESHIFT512K;
		tsb_slab_size = MMU_PAGESIZE512K;
		tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT;
		use_bigtsb_arena = 0;
	}

	/* Without a big TSB arena the bigtsb parameters mirror the regular ones. */
	if (!use_bigtsb_arena) {
		bigtsb_slab_ttesz = tsb_slab_ttesz;
		bigtsb_slab_shift = tsb_slab_shift;
		bigtsb_slab_size = tsb_slab_size;
		bigtsb_slab_mask = tsb_slab_mask;
	}


	/*
	 * Set up memory callback to update tsb_alloc_hiwater and
	 * tsb_max_growsize.
	 */
	i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0);
	ASSERT(i == 0);

	/*
	 * kmem_tsb_arena is the source from which large TSB slabs are
	 * drawn.  The quantum of this arena corresponds to the largest
	 * TSB size we can dynamically allocate for user processes.
	 * Currently it must also be a supported page size since we
	 * use exactly one translation entry to map each slab page.
	 *
	 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
	 * which most TSBs are allocated.  Since most TSB allocations are
	 * typically 8K we have a kmem cache we stack on top of each
	 * kmem_tsb_default_arena to speed up those allocations.
	 *
	 * Note the two-level scheme of arenas is required only
	 * because vmem_create doesn't allow us to specify alignment
	 * requirements.  If this ever changes the code could be
	 * simplified to use only one level of arenas.
	 *
	 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
	 * will be provided in addition to the 4M kmem_tsb_arena.
	 */
	if (use_bigtsb_arena) {
		kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0,
		    bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper,
		    vmem_xfree, heap_arena, 0, VM_SLEEP);
	}

	kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size,
	    sfmmu_vmem_xalloc_aligned_wrapper,
	    vmem_xfree, heap_arena, 0, VM_SLEEP);

	if (tsb_lgrp_affinity) {
		/* One set of arenas and one 8K TSB cache per lgroup. */
		char s[50];
		for (i = 0; i < NLGRPS_MAX; i++) {
			if (use_bigtsb_arena) {
				(void) sprintf(s, "kmem_bigtsb_lgrp%d", i);
				kmem_bigtsb_default_arena[i] = vmem_create(s,
				    NULL, 0, 2 * tsb_slab_size,
				    sfmmu_tsb_segkmem_alloc,
				    sfmmu_tsb_segkmem_free, kmem_bigtsb_arena,
				    0, VM_SLEEP | VM_BESTFIT);
			}

			(void) sprintf(s, "kmem_tsb_lgrp%d", i);
			kmem_tsb_default_arena[i] = vmem_create(s,
			    NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
			    sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
			    VM_SLEEP | VM_BESTFIT);

			(void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i);
			sfmmu_tsb_cache[i] = kmem_cache_create(s,
			    PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
			    kmem_tsb_default_arena[i], 0);
		}
	} else {
		/* No lgroup affinity: only slot 0 of each array is used. */
		if (use_bigtsb_arena) {
			kmem_bigtsb_default_arena[0] =
			    vmem_create("kmem_bigtsb_default", NULL, 0,
			    2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc,
			    sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0,
			    VM_SLEEP | VM_BESTFIT);
		}

		kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default",
		    NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
		    sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
		    VM_SLEEP | VM_BESTFIT);
		sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache",
		    PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
		    kmem_tsb_default_arena[0], 0);
	}

	sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ,
	    HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
	    sfmmu_hblkcache_destructor,
	    sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ,
	    hat_memload_arena, KMC_NOHASH);

	hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE,
	    segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP);

	sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ,
	    HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
	    sfmmu_hblkcache_destructor,
	    NULL, (void *)HME1BLK_SZ,
	    hat_memload1_arena, KMC_NOHASH);

	pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ,
	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);

	ism_blk_cache = kmem_cache_create("ism_blk_cache",
	    sizeof (ism_blk_t), ecache_alignsize, NULL, NULL,
	    NULL, NULL, static_arena, KMC_NOHASH);

	ism_ment_cache = kmem_cache_create("ism_ment_cache",
	    sizeof (ism_ment_t), 0, NULL, NULL,
	    NULL, NULL, NULL, 0);

	/*
	 * We grab the first hat for the kernel,
	 */
	AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER);
	kas.a_hat = hat_alloc(&kas);
	AS_LOCK_EXIT(&kas, &kas.a_lock);

	/*
	 * Initialize hblk_reserve.
	 */
	((struct hme_blk *)hblk_reserve)->hblk_nextpa =
	    va_to_pa((caddr_t)hblk_reserve);

#ifndef UTSB_PHYS
	/*
	 * Reserve some kernel virtual address space for the locked TTEs
	 * that allow us to probe the TSB from TL>0.
	 */
	utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
	    0, 0, NULL, NULL, VM_SLEEP);
	utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
	    0, 0, NULL, NULL, VM_SLEEP);
#endif

#ifdef VAC
	/*
	 * The big page VAC handling code assumes VAC
	 * will not be bigger than the smallest big
	 * page- which is 64K.
	 */
	if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) {
		cmn_err(CE_PANIC, "VAC too big!");
	}
#endif

	(void) xhat_init();

	uhme_hash_pa = va_to_pa(uhme_hash);
	khme_hash_pa = va_to_pa(khme_hash);

	/*
	 * Initialize relocation locks. kpr_suspendlock is held
	 * at PIL_MAX to prevent interrupts from pinning the holder
	 * of a suspended TTE which may access it leading to a
	 * deadlock condition.
	 */
	mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX);

	/*
	 * If Shared context support is disabled via /etc/system
	 * set shctx_on to 0 here if it was set to 1 earlier in boot
	 * sequence by cpu module initialization code.
	 */
	if (shctx_on && disable_shctx) {
		shctx_on = 0;
	}

	if (shctx_on) {
		srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS *
		    sizeof (srd_buckets[0]), KM_SLEEP);
		for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) {
			mutex_init(&srd_buckets[i].srdb_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}

		srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t),
		    0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor,
		    NULL, NULL, NULL, 0);
		region_cache = kmem_cache_create("region_cache",
		    sizeof (sf_region_t), 0, sfmmu_rgncache_constructor,
		    sfmmu_rgncache_destructor, NULL, NULL, NULL, 0);
		scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t),
		    0, sfmmu_scdcache_constructor,  sfmmu_scdcache_destructor,
		    NULL, NULL, NULL, 0);
	}

	/*
	 * Pre-allocate hrm_hashtab before enabling the collection of
	 * refmod statistics.  Allocating on the fly would mean us
	 * running the risk of suffering recursive mutex enters or
	 * deadlocks.
	 */
	hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
	    KM_SLEEP);

	/*
	 * Allocate per-cpu pending freelist of hmeblks.  The allocation is
	 * padded by 64 bytes and the pointer rounded up to a 64-byte
	 * boundary; the unrounded pointer is not retained.
	 */
	cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64,
	    KM_SLEEP);
	cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP(
	    (uintptr_t)cpu_hme_pend, 64);

	for (i = 0; i < NCPU; i++) {
		mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT,
		    NULL);
	}

	/* Apply the default threshold unless one was already set. */
	if (cpu_hme_pend_thresh == 0) {
		cpu_hme_pend_thresh = CPU_HME_PEND_THRESH;
	}
}
   1445 
   1446 /*
   1447  * Initialize locking for the hat layer, called early during boot.
   1448  */
   1449 static void
   1450 hat_lock_init()
   1451 {
   1452 	int i;
   1453 
   1454 	/*
   1455 	 * initialize the array of mutexes protecting a page's mapping
   1456 	 * list and p_nrm field.
   1457 	 */
   1458 	for (i = 0; i < mml_table_sz; i++)
   1459 		mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL);
   1460 
   1461 	if (kpm_enable) {
   1462 		for (i = 0; i < kpmp_table_sz; i++) {
   1463 			mutex_init(&kpmp_table[i].khl_mutex, NULL,
   1464 			    MUTEX_DEFAULT, NULL);
   1465 		}
   1466 	}
   1467 
   1468 	/*
   1469 	 * Initialize array of mutex locks that protects sfmmu fields and
   1470 	 * TSB lists.
   1471 	 */
   1472 	for (i = 0; i < SFMMU_NUM_LOCK; i++)
   1473 		mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT,
   1474 		    NULL);
   1475 }
   1476 
   1477 #define	SFMMU_KERNEL_MAXVA \
   1478 	(kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))
   1479 
/*
 * Allocate a hat structure.
 * Called when an address space first uses a hat.
 *
 * The caller must hold as->a_lock as writer.  The sfmmu comes from
 * sfmmuid_cache (KM_SLEEP) and is returned fully initialized.
 *
 * For the kernel address space (&kas) the new hat is also recorded in
 * ksfmmup, its contexts are set to KCONTEXT, no TSB is allocated here and
 * sfmmu_cpusran is set to all CPUs.  For every other address space the
 * contexts start out as INVALID_CONTEXT, a TSB is allocated and the hat is
 * flagged HAT_SWAPPED | HAT_ALLCTX_INVALID.
 */
struct hat *
hat_alloc(struct as *as)
{
	sfmmu_t *sfmmup;
	int i;
	uint64_t cnum;
	extern uint_t get_color_start(struct as *);

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
	sfmmup->sfmmu_as = as;
	sfmmup->sfmmu_flags = 0;
	sfmmup->sfmmu_tteflags = 0;
	sfmmup->sfmmu_rtteflags = 0;
	LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock);

	if (as == &kas) {
		ksfmmup = sfmmup;
		sfmmup->sfmmu_cext = 0;
		cnum = KCONTEXT;

		sfmmup->sfmmu_clrstart = 0;
		sfmmup->sfmmu_tsb = NULL;
		/*
		 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
		 * to setup tsb_info for ksfmmup.
		 */
	} else {

		/*
		 * Just set to invalid ctx. When it faults, it will
		 * get a valid ctx. This would avoid the situation
		 * where we get a ctx, but it gets stolen and then
		 * we fault when we try to run and so have to get
		 * another ctx.
		 */
		sfmmup->sfmmu_cext = 0;
		cnum = INVALID_CONTEXT;

		/* initialize original physical page coloring bin */
		sfmmup->sfmmu_clrstart = get_color_start(as);
		/*
		 * NOTE: on DEBUG builds the "else" below binds to the
		 * default-size allocation that follows the #endif; on
		 * non-DEBUG builds that allocation is unconditional.
		 */
#ifdef DEBUG
		if (tsb_random_size) {
			uint32_t randval = (uint32_t)gettick() >> 4;
			int size = randval % (tsb_max_growsize + 1);

			/* chose a random tsb size for stress testing */
			(void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size,
			    TSB8K|TSB64K|TSB512K, 0, sfmmup);
		} else
#endif /* DEBUG */
			(void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb,
			    default_tsb_size,
			    TSB8K|TSB64K|TSB512K, 0, sfmmup);
		sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID;
		ASSERT(sfmmup->sfmmu_tsb != NULL);
	}

	/* Every context domain starts with the context number chosen above. */
	ASSERT(max_mmu_ctxdoms > 0);
	for (i = 0; i < max_mmu_ctxdoms; i++) {
		sfmmup->sfmmu_ctxs[i].cnum = cnum;
		sfmmup->sfmmu_ctxs[i].gnum = 0;
	}

	/* Zero the per-page-size counters and default each size to TTE8K. */
	for (i = 0; i < max_mmu_page_sizes; i++) {
		sfmmup->sfmmu_ttecnt[i] = 0;
		sfmmup->sfmmu_scdrttecnt[i] = 0;
		sfmmup->sfmmu_ismttecnt[i] = 0;
		sfmmup->sfmmu_scdismttecnt[i] = 0;
		sfmmup->sfmmu_pgsz[i] = TTE8K;
	}
	sfmmup->sfmmu_tsb0_4minflcnt = 0;
	sfmmup->sfmmu_iblk = NULL;
	sfmmup->sfmmu_ismhat = 0;
	sfmmup->sfmmu_scdhat = 0;
	sfmmup->sfmmu_ismblkpa = (uint64_t)-1;
	if (sfmmup == ksfmmup) {
		CPUSET_ALL(sfmmup->sfmmu_cpusran);
	} else {
		CPUSET_ZERO(sfmmup->sfmmu_cpusran);
	}
	sfmmup->sfmmu_free = 0;
	sfmmup->sfmmu_rmstat = 0;
	sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart;
	sfmmup->sfmmu_xhat_provider = NULL;
	cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL);
	sfmmup->sfmmu_srdp = NULL;
	SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map);
	bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
	sfmmup->sfmmu_scdp = NULL;
	sfmmup->sfmmu_scd_link.next = NULL;
	sfmmup->sfmmu_scd_link.prev = NULL;
	return (sfmmup);
}
   1578 
   1579 /*
   1580  * Create per-MMU context domain kstats for a given MMU ctx.
   1581  */
   1582 static void
   1583 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp)
   1584 {
   1585 	mmu_ctx_stat_t	stat;
   1586 	kstat_t		*mmu_kstat;
   1587 
   1588 	ASSERT(MUTEX_HELD(&cpu_lock));
   1589 	ASSERT(mmu_ctxp->mmu_kstat == NULL);
   1590 
   1591 	mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx",
   1592 	    "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL);
   1593 
   1594 	if (mmu_kstat == NULL) {
   1595 		cmn_err(CE_WARN, "kstat_create for MMU %d failed",
   1596 		    mmu_ctxp->mmu_idx);
   1597 	} else {
   1598 		mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data;
   1599 		for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++)
   1600 			kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat],
   1601 			    mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64);
   1602 		mmu_ctxp->mmu_kstat = mmu_kstat;
   1603 		kstat_install(mmu_kstat);
   1604 	}
   1605 }
   1606 
   1607 /*
   1608  * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
   1609  * context domain information for a given CPU. If a platform does not
   1610  * specify that interface, then the function below is used instead to return
   1611  * default information. The defaults are as follows:
   1612  *
   1613  *	- For sun4u systems there's one MMU context domain per CPU.
   1614  *	  This default is used by all sun4u systems except OPL. OPL systems
   1615  *	  provide platform specific interface to map CPU ids to MMU ids
   1616  *	  because on OPL more than 1 CPU shares a single MMU.
   1617  *        Note that on sun4v, there is one global context domain for
   1618  *	  the entire system. This is to avoid running into potential problem
   1619  *	  with ldom physical cpu substitution feature.
   1620  *	- The number of MMU context IDs supported on any CPU in the
   1621  *	  system is 8K.
   1622  */
   1623 /*ARGSUSED*/
   1624 static void
   1625 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
   1626 {
   1627 	infop->mmu_nctxs = nctxs;
   1628 #ifndef sun4v
   1629 	infop->mmu_idx = cpu[cpuid]->cpu_seqid;
   1630 #else /* sun4v */
   1631 	infop->mmu_idx = 0;
   1632 #endif /* sun4v */
   1633 }
   1634 
   1635 /*
   1636  * Called during CPU initialization to set the MMU context-related information
   1637  * for a CPU.
   1638  *
   1639  * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
   1640  */
   1641 void
   1642 sfmmu_cpu_init(cpu_t *cp)
   1643 {
   1644 	mmu_ctx_info_t	info;
   1645 	mmu_ctx_t	*mmu_ctxp;
   1646 
   1647 	ASSERT(MUTEX_HELD(&cpu_lock));
   1648 
   1649 	if (&plat_cpuid_to_mmu_ctx_info == NULL)
   1650 		sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
   1651 	else
   1652 		plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
   1653 
   1654 	ASSERT(info.mmu_idx < max_mmu_ctxdoms);
   1655 
   1656 	if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) {
   1657 		/* Each mmu_ctx is cacheline aligned. */
   1658 		mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP);
   1659 		bzero(mmu_ctxp, sizeof (mmu_ctx_t));
   1660 
   1661 		mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN,
   1662 		    (void *)ipltospl(DISP_LEVEL));
   1663 		mmu_ctxp->mmu_idx = info.mmu_idx;
   1664 		mmu_ctxp->mmu_nctxs = info.mmu_nctxs;
   1665 		/*
   1666 		 * Globally for lifetime of a system,
   1667 		 * gnum must always increase.
   1668 		 * mmu_saved_gnum is protected by the cpu_lock.
   1669 		 */
   1670 		mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1;
   1671 		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
   1672 
   1673 		sfmmu_mmu_kstat_create(mmu_ctxp);
   1674 
   1675 		mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
   1676 	} else {
   1677 		ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
   1678 	}
   1679 
   1680 	/*
   1681 	 * The mmu_lock is acquired here to prevent races with
   1682 	 * the wrap-around code.
   1683 	 */
   1684 	mutex_enter(&mmu_ctxp->mmu_lock);
   1685 
   1686 
   1687 	mmu_ctxp->mmu_ncpus++;
   1688 	CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id);
   1689 	CPU_MMU_IDX(cp) = info.mmu_idx;
   1690 	CPU_MMU_CTXP(cp) = mmu_ctxp;
   1691 
   1692 	mutex_exit(&mmu_ctxp->mmu_lock);
   1693 }
   1694 
   1695 /*
   1696  * Called to perform MMU context-related cleanup for a CPU.
   1697  */
   1698 void
   1699 sfmmu_cpu_cleanup(cpu_t *cp)
   1700 {
   1701 	mmu_ctx_t	*mmu_ctxp;
   1702 
   1703 	ASSERT(MUTEX_HELD(&cpu_lock));
   1704 
   1705 	mmu_ctxp = CPU_MMU_CTXP(cp);
   1706 	ASSERT(mmu_ctxp != NULL);
   1707 
   1708 	/*
   1709 	 * The mmu_lock is acquired here to prevent races with
   1710 	 * the wrap-around code.
   1711 	 */
   1712 	mutex_enter(&mmu_ctxp->mmu_lock);
   1713 
   1714 	CPU_MMU_CTXP(cp) = NULL;
   1715 
   1716 	CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id);
   1717 	if (--mmu_ctxp->mmu_ncpus == 0) {
   1718 		mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
   1719 		mutex_exit(&mmu_ctxp->mmu_lock);
   1720 		mutex_destroy(&mmu_ctxp->mmu_lock);
   1721 
   1722 		if (mmu_ctxp->mmu_kstat)
   1723 			kstat_delete(mmu_ctxp->mmu_kstat);
   1724 
   1725 		/* mmu_saved_gnum is protected by the cpu_lock. */
   1726 		if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
   1727 			mmu_saved_gnum = mmu_ctxp->mmu_gnum;
   1728 
   1729 		kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
   1730 
   1731 		return;
   1732 	}
   1733 
   1734 	mutex_exit(&mmu_ctxp->mmu_lock);
   1735 }
   1736 
   1737 /*
   1738  * Hat_setup, makes an address space context the current active one.
   1739  * In sfmmu this translates to setting the secondary context with the
   1740  * corresponding context.
   1741  */
   1742 void
   1743 hat_setup(struct hat *sfmmup, int allocflag)
   1744 {
   1745 	hatlock_t *hatlockp;
   1746 
   1747 	/* Init needs some special treatment. */
   1748 	if (allocflag == HAT_INIT) {
   1749 		/*
   1750 		 * Make sure that we have
   1751 		 * 1. a TSB
   1752 		 * 2. a valid ctx that doesn't get stolen after this point.
   1753 		 */
   1754 		hatlockp = sfmmu_hat_enter(sfmmup);
   1755 
   1756 		/*
   1757 		 * Swap in the TSB.  hat_init() allocates tsbinfos without
   1758 		 * TSBs, but we need one for init, since the kernel does some
   1759 		 * special things to set up its stack and needs the TSB to
   1760 		 * resolve page faults.
   1761 		 */
   1762 		sfmmu_tsb_swapin(sfmmup, hatlockp);
   1763 
   1764 		sfmmu_get_ctx(sfmmup);
   1765 
   1766 		sfmmu_hat_exit(hatlockp);
   1767 	} else {
   1768 		ASSERT(allocflag == HAT_ALLOC);
   1769 
   1770 		hatlockp = sfmmu_hat_enter(sfmmup);
   1771 		kpreempt_disable();
   1772 
   1773 		CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id);
   1774 		/*
   1775 		 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter,
   1776 		 * pagesize bits don't matter in this case since we are passing
   1777 		 * INVALID_CONTEXT to it.
   1778 		 * Compatibility Note: hw takes care of MMU_SCONTEXT1
   1779 		 */
   1780 		sfmmu_setctx_sec(INVALID_CONTEXT);
   1781 		sfmmu_clear_utsbinfo();
   1782 
   1783 		kpreempt_enable();
   1784 		sfmmu_hat_exit(hatlockp);
   1785 	}
   1786 }
   1787 
/*
 * Free all the translation resources for the specified address space.
 * Called from as_free when an address space is being destroyed.
 *
 * This first phase only marks the hat as being freed (sfmmu_free = 1) and,
 * if the hat is attached to an SCD (sfmmu_scdp != NULL), detaches it via
 * sfmmu_leave_scd().  The remaining teardown is done by hat_free_end().
 *
 * The caller must hold the address space lock as writer.  The hat must not
 * be the kernel hat and must not have an xhat provider.
 */
void
hat_free_start(struct hat *sfmmup)
{
	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);

	/* From here on the hat is flagged as being torn down. */
	sfmmup->sfmmu_free = 1;
	if (sfmmup->sfmmu_scdp != NULL) {
		sfmmu_leave_scd(sfmmup, 0);
	}

	/* sfmmu_leave_scd() is expected to have cleared the SCD pointer. */
	ASSERT(sfmmup->sfmmu_scdp == NULL);
}
   1806 
   1807 void
   1808 hat_free_end(struct hat *sfmmup)
   1809 {
   1810 	int i;
   1811 
   1812 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   1813 	ASSERT(sfmmup->sfmmu_free == 1);
   1814 	ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
   1815 	ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
   1816 	ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
   1817 	ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
   1818 	ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
   1819 	ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
   1820 
   1821 	if (sfmmup->sfmmu_rmstat) {
   1822 		hat_freestat(sfmmup->sfmmu_as, NULL);
   1823 	}
   1824 
   1825 	while (sfmmup->sfmmu_tsb != NULL) {
   1826 		struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next;
   1827 		sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb);
   1828 		sfmmup->sfmmu_tsb = next;
   1829 	}
   1830 
   1831 	if (sfmmup->sfmmu_srdp != NULL) {
   1832 		sfmmu_leave_srd(sfmmup);
   1833 		ASSERT(sfmmup->sfmmu_srdp == NULL);
   1834 		for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
   1835 			if (sfmmup->sfmmu_hmeregion_links[i] != NULL) {
   1836 				kmem_free(sfmmup->sfmmu_hmeregion_links[i],
   1837 				    SFMMU_L2_HMERLINKS_SIZE);
   1838 				sfmmup->sfmmu_hmeregion_links[i] = NULL;
   1839 			}
   1840 		}
   1841 	}
   1842 	sfmmu_free_sfmmu(sfmmup);
   1843 
   1844 #ifdef DEBUG
   1845 	for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
   1846 		ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL);
   1847 	}
   1848 #endif
   1849 
   1850 	kmem_cache_free(sfmmuid_cache, sfmmup);
   1851 }
   1852 
/*
 * Set up any translation structures, for the specified address space,
 * that are needed or preferred when the process is being swapped in.
 *
 * This implementation does no work here: the body only asserts that the
 * hat has no xhat provider.
 */
/* ARGSUSED */
void
hat_swapin(struct hat *hat)
{
	ASSERT(hat->sfmmu_xhat_provider == NULL);
}
   1863 
/*
 * Free all of the translation resources, for the specified address space,
 * that can be freed while the process is swapped out. Called from as_swapout.
 * Also, free up the ctx that this process was using.
 *
 * The work is done in three steps:
 *  1) walk every user hme hash bucket, unloading this hat's unlocked,
 *     non-shadow hme_blks and unlinking any hme_blk (of any hat) that has
 *     become empty;
 *  2) under the HAT lock, invalidate the hat's context, mark the hat and
 *     each of its tsbinfos as swapped, and thread the TSBs onto a private
 *     free list;
 *  3) after dropping the HAT lock, free the TSB memory.
 */
void
hat_swapout(struct hat *sfmmup)
{
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk = NULL;
	struct hme_blk *nx_hblk;
	int i;
	struct hme_blk *list = NULL;
	hatlock_t *hatlockp;
	struct tsb_info *tsbinfop;
	/*
	 * List node overlaid on the start of each TSB's own memory
	 * (tsb_va), so building the free list needs no allocation.
	 */
	struct free_tsb {
		struct free_tsb *next;
		struct tsb_info *tsbinfop;
	};			/* free list of TSBs */
	struct free_tsb *freelist, *last, *next;

	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	SFMMU_STAT(sf_swapout);

	/*
	 * There is no way to go from an as to all its translations in sfmmu.
	 * Here is one of the times when we take the big hit and traverse
	 * the hash looking for hme_blks to free up.  Not only do we free up
	 * this as hme_blks but all those that are free.  We are obviously
	 * swapping because we need memory so let's free up as much
	 * as we can.
	 *
	 * Note that we don't flush TLB/TSB here -- it's not necessary
	 * because:
	 *  1) we free the ctx we're using and throw away the TSB(s);
	 *  2) processes aren't runnable while being swapped out.
	 */
	ASSERT(sfmmup != KHATID);
	/*
	 * NOTE(review): the bound is inclusive, so UHMEHASH_SZ is presumably
	 * the last valid index of uhme_hash rather than its element count --
	 * confirm against its definition.
	 */
	for (i = 0; i <= UHMEHASH_SZ; i++) {
		hmebp = &uhme_hash[i];
		SFMMU_HASH_LOCK(hmebp);
		hmeblkp = hmebp->hmeblkp;
		pr_hblk = NULL;
		while (hmeblkp) {

			ASSERT(!hmeblkp->hblk_xhat_bit);

			/*
			 * Only unload blocks that belong to this hat and are
			 * neither shadow blocks nor locked.
			 */
			if ((hmeblkp->hblk_tag.htag_id == sfmmup) &&
			    !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) {
				ASSERT(!hmeblkp->hblk_shared);
				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
				    (caddr_t)get_hblk_base(hmeblkp),
				    get_hblk_endaddr(hmeblkp),
				    NULL, HAT_UNLOAD);
			}
			/* Save the successor before possibly unlinking. */
			nx_hblk = hmeblkp->hblk_next;
			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
				/*
				 * Empty block (ours or anyone's): unlink it
				 * and collect it on 'list'.  pr_hblk is left
				 * unchanged because this block is no longer
				 * the predecessor of anything.
				 */
				ASSERT(!hmeblkp->hblk_lckcnt);
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			} else {
				pr_hblk = hmeblkp;
			}
			hmeblkp = nx_hblk;
		}
		SFMMU_HASH_UNLOCK(hmebp);
	}

	/* Release everything collected above, with no hash lock held. */
	sfmmu_hblks_list_purge(&list, 0);

	/*
	 * Now free up the ctx so that others can reuse it.
	 */
	hatlockp = sfmmu_hat_enter(sfmmup);

	sfmmu_invalidate_ctx(sfmmup);

	/*
	 * Free TSBs, but not tsbinfos, and set SWAPPED flag.
	 * If TSBs were never swapped in, just return.
	 * This implies that we don't support partial swapping
	 * of TSBs -- either all are swapped out, or none are.
	 *
	 * We must hold the HAT lock here to prevent racing with another
	 * thread trying to unmap TTEs from the TSB or running the post-
	 * relocator after relocating the TSB's memory.  Unfortunately, we
	 * can't free memory while holding the HAT lock or we could
	 * deadlock, so we build a list of TSBs to be freed after marking
	 * the tsbinfos as swapped out and free them after dropping the
	 * lock.
	 */
	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
		sfmmu_hat_exit(hatlockp);
		return;
	}

	SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED);
	last = freelist = NULL;
	for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
	    tsbinfop = tsbinfop->tsb_next) {
		ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0);

		/*
		 * Cast the TSB into a struct free_tsb and put it on the free
		 * list.
		 */
		if (freelist == NULL) {
			last = freelist = (struct free_tsb *)tsbinfop->tsb_va;
		} else {
			last->next = (struct free_tsb *)tsbinfop->tsb_va;
			last = last->next;
		}
		last->next = NULL;
		last->tsbinfop = tsbinfop;
		tsbinfop->tsb_flags |= TSB_SWAPPED;
		/*
		 * Zero out the TTE to clear the valid bit.
		 * Note we can't use a value like 0xbad because we want to
		 * ensure diagnostic bits are NEVER set on TTEs that might
		 * be loaded.  The intent is to catch any invalid access
		 * to the swapped TSB, such as a thread running with a valid
		 * context without first calling sfmmu_tsb_swapin() to
		 * allocate TSB memory.
		 */
		tsbinfop->tsb_tte.ll = 0;
	}

	/* Now we can drop the lock and free the TSB memory. */
	sfmmu_hat_exit(hatlockp);
	for (; freelist != NULL; freelist = next) {
		/*
		 * Read the link first: the node lives inside the TSB memory
		 * that is about to be freed.
		 */
		next = freelist->next;
		sfmmu_tsb_free(freelist->tsbinfop);
	}
}
   1999 
   2000 /*
   2001  * Duplicate the translations of an as into another newas
   2002  */
   2003 /* ARGSUSED */
   2004 int
   2005 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len,
   2006 	uint_t flag)
   2007 {
   2008 	sf_srd_t *srdp;
   2009 	sf_scd_t *scdp;
   2010 	int i;
   2011 	extern uint_t get_color_start(struct as *);
   2012 
   2013 	ASSERT(hat->sfmmu_xhat_provider == NULL);
   2014 	ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) ||
   2015 	    (flag == HAT_DUP_SRD));
   2016 	ASSERT(hat != ksfmmup);
   2017 	ASSERT(newhat != ksfmmup);
   2018 	ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp);
   2019 
   2020 	if (flag == HAT_DUP_COW) {
   2021 		panic("hat_dup: HAT_DUP_COW not supported");
   2022 	}
   2023 
   2024 	if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) {
   2025 		ASSERT(srdp->srd_evp != NULL);
   2026 		VN_HOLD(srdp->srd_evp);
   2027 		ASSERT(srdp->srd_refcnt > 0);
   2028 		newhat->sfmmu_srdp = srdp;
   2029 		atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1);
   2030 	}
   2031 
   2032 	/*
   2033 	 * HAT_DUP_ALL flag is used after as duplication is done.
   2034 	 */
   2035 	if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) {
   2036 		ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2);
   2037 		newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags;
   2038 		if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) {
   2039 			newhat->sfmmu_flags |= HAT_4MTEXT_FLAG;
   2040 		}
   2041 
   2042 		/* check if need to join scd */
   2043 		if ((scdp = hat->sfmmu_scdp) != NULL &&
   2044 		    newhat->sfmmu_scdp != scdp) {
   2045 			int ret;
   2046 			SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map,
   2047 			    &scdp->scd_region_map, ret);
   2048 			ASSERT(ret);
   2049 			sfmmu_join_scd(scdp, newhat);
   2050 			ASSERT(newhat->sfmmu_scdp == scdp &&
   2051 			    scdp->scd_refcnt >= 2);
   2052 			for (i = 0; i < max_mmu_page_sizes; i++) {
   2053 				newhat->sfmmu_ismttecnt[i] =
   2054 				    hat->sfmmu_ismttecnt[i];
   2055 				newhat->sfmmu_scdismttecnt[i] =
   2056 				    hat->sfmmu_scdismttecnt[i];
   2057 			}
   2058 		}
   2059 
   2060 		sfmmu_check_page_sizes(newhat, 1);
   2061 	}
   2062 
   2063 	if (flag == HAT_DUP_ALL && consistent_coloring == 0 &&
   2064 	    update_proc_pgcolorbase_after_fork != 0) {
   2065 		hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as);
   2066 	}
   2067 	return (0);
   2068 }
   2069 
/*
 * Load a translation mapping addr to page pp, with the given attributes
 * and load flags, into hat.
 *
 * Thin wrapper around hat_do_memload() that passes
 * SFMMU_INVALID_SHMERID, i.e. the mapping is not associated with any
 * hme region.
 */
void
hat_memload(struct hat *hat, caddr_t addr, struct page *pp,
	uint_t attr, uint_t flags)
{
	hat_do_memload(hat, addr, pp, attr, flags,
	    SFMMU_INVALID_SHMERID);
}
   2077 
   2078 void
   2079 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
   2080 	uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
   2081 {
   2082 	uint_t rid;
   2083 	if (rcookie == HAT_INVALID_REGION_COOKIE ||
   2084 	    hat->sfmmu_xhat_provider != NULL) {
   2085 		hat_do_memload(hat, addr, pp, attr, flags,
   2086 		    SFMMU_INVALID_SHMERID);
   2087 		return;
   2088 	}
   2089 	rid = (uint_t)((uint64_t)rcookie);
   2090 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   2091 	hat_do_memload(hat, addr, pp, attr, flags, rid);
   2092 }
   2093 
   2094 /*
   2095  * Set up addr to map to page pp with protection prot.
   2096  * As an optimization we also load the TSB with the
   2097  * corresponding tte but it is no big deal if  the tte gets kicked out.
   2098  */
   2099 static void
   2100 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp,
   2101 	uint_t attr, uint_t flags, uint_t rid)
   2102 {
   2103 	tte_t tte;
   2104 
   2105 
   2106 	ASSERT(hat != NULL);
   2107 	ASSERT(PAGE_LOCKED(pp));
   2108 	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
   2109 	ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
   2110 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   2111 	SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE);
   2112 
   2113 	if (PP_ISFREE(pp)) {
   2114 		panic("hat_memload: loading a mapping to free page %p",
   2115 		    (void *)pp);
   2116 	}
   2117 
   2118 	if (hat->sfmmu_xhat_provider) {
   2119 		/* no regions for xhats */
   2120 		ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
   2121 		XHAT_MEMLOAD(hat, addr, pp, attr, flags);
   2122 		return;
   2123 	}
   2124 
   2125 	ASSERT((hat == ksfmmup) ||
   2126 	    AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock));
   2127 
   2128 	if (flags & ~SFMMU_LOAD_ALLFLAG)
   2129 		cmn_err(CE_NOTE, "hat_memload: unsupported flags %d",
   2130 		    flags & ~SFMMU_LOAD_ALLFLAG);
   2131 
   2132 	if (hat->sfmmu_rmstat)
   2133 		hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr);
   2134 
   2135 #if defined(SF_ERRATA_57)
   2136 	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
   2137 	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
   2138 	    !(flags & HAT_LOAD_SHARE)) {
   2139 		cmn_err(CE_WARN, "hat_memload: illegal attempt to make user "
   2140 		    " page executable");
   2141 		attr &= ~PROT_EXEC;
   2142 	}
   2143 #endif
   2144 
   2145 	sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
   2146 	(void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid);
   2147 
   2148 	/*
   2149 	 * Check TSB and TLB page sizes.
   2150 	 */
   2151 	if ((flags & HAT_LOAD_SHARE) == 0) {
   2152 		sfmmu_check_page_sizes(hat, 1);
   2153 	}
   2154 }
   2155 
   2156 /*
   2157  * hat_devload can be called to map real memory (e.g.
   2158  * /dev/kmem) and even though hat_devload will determine pf is
   2159  * for memory, it will be unable to get a shared lock on the
   2160  * page (because someone else has it exclusively) and will
   2161  * pass dp = NULL.  If tteload doesn't get a non-NULL
   2162  * page pointer it can't cache memory.
   2163  */
   2164 void
   2165 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn,
   2166 	uint_t attr, int flags)
   2167 {
   2168 	tte_t tte;
   2169 	struct page *pp = NULL;
   2170 	int use_lgpg = 0;
   2171 
   2172 	ASSERT(hat != NULL);
   2173 
   2174 	if (hat->sfmmu_xhat_provider) {
   2175 		XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags);
   2176 		return;
   2177 	}
   2178 
   2179 	ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
   2180 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   2181 	ASSERT((hat == ksfmmup) ||
   2182 	    AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock));
   2183 	if (len == 0)
   2184 		panic("hat_devload: zero len");
   2185 	if (flags & ~SFMMU_LOAD_ALLFLAG)
   2186 		cmn_err(CE_NOTE, "hat_devload: unsupported flags %d",
   2187 		    flags & ~SFMMU_LOAD_ALLFLAG);
   2188 
   2189 #if defined(SF_ERRATA_57)
   2190 	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
   2191 	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
   2192 	    !(flags & HAT_LOAD_SHARE)) {
   2193 		cmn_err(CE_WARN, "hat_devload: illegal attempt to make user "
   2194 		    " page executable");
   2195 		attr &= ~PROT_EXEC;
   2196 	}
   2197 #endif
   2198 
   2199 	/*
   2200 	 * If it's a memory page find its pp
   2201 	 */
   2202 	if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) {
   2203 		pp = page_numtopp_nolock(pfn);
   2204 		if (pp == NULL) {
   2205 			flags |= HAT_LOAD_NOCONSIST;
   2206 		} else {
   2207 			if (PP_ISFREE(pp)) {
   2208 				panic("hat_memload: loading "
   2209 				    "a mapping to free page %p",
   2210 				    (void *)pp);
   2211 			}
   2212 			if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
   2213 				panic("hat_memload: loading a mapping "
   2214 				    "to unlocked relocatable page %p",
   2215 				    (void *)pp);
   2216 			}
   2217 			ASSERT(len == MMU_PAGESIZE);
   2218 		}
   2219 	}
   2220 
   2221 	if (hat->sfmmu_rmstat)
   2222 		hat_resvstat(len, hat->sfmmu_as, addr);
   2223 
   2224 	if (flags & HAT_LOAD_NOCONSIST) {
   2225 		attr |= SFMMU_UNCACHEVTTE;
   2226 		use_lgpg = 1;
   2227 	}
   2228 	if (!pf_is_memory(pfn)) {
   2229 		attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC;
   2230 		use_lgpg = 1;
   2231 		switch (attr & HAT_ORDER_MASK) {
   2232 			case HAT_STRICTORDER:
   2233 			case HAT_UNORDERED_OK:
   2234 				/*
   2235 				 * we set the side effect bit for all non
   2236 				 * memory mappings unless merging is ok
   2237 				 */
   2238 				attr |= SFMMU_SIDEFFECT;
   2239 				break;
   2240 			case HAT_MERGING_OK:
   2241 			case HAT_LOADCACHING_OK:
   2242 			case HAT_STORECACHING_OK:
   2243 				break;
   2244 			default:
   2245 				panic("hat_devload: bad attr");
   2246 				break;
   2247 		}
   2248 	}
   2249 	while (len) {
   2250 		if (!use_lgpg) {
   2251 			sfmmu_memtte(&tte, pfn, attr, TTE8K);
   2252 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2253 			    flags, SFMMU_INVALID_SHMERID);
   2254 			len -= MMU_PAGESIZE;
   2255 			addr += MMU_PAGESIZE;
   2256 			pfn++;
   2257 			continue;
   2258 		}
   2259 		/*
   2260 		 *  try to use large pages, check va/pa alignments
   2261 		 *  Note that 32M/256M page sizes are not (yet) supported.
   2262 		 */
   2263 		if ((len >= MMU_PAGESIZE4M) &&
   2264 		    !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
   2265 		    !(disable_large_pages & (1 << TTE4M)) &&
   2266 		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) {
   2267 			sfmmu_memtte(&tte, pfn, attr, TTE4M);
   2268 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2269 			    flags, SFMMU_INVALID_SHMERID);
   2270 			len -= MMU_PAGESIZE4M;
   2271 			addr += MMU_PAGESIZE4M;
   2272 			pfn += MMU_PAGESIZE4M / MMU_PAGESIZE;
   2273 		} else if ((len >= MMU_PAGESIZE512K) &&
   2274 		    !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
   2275 		    !(disable_large_pages & (1 << TTE512K)) &&
   2276 		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) {
   2277 			sfmmu_memtte(&tte, pfn, attr, TTE512K);
   2278 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2279 			    flags, SFMMU_INVALID_SHMERID);
   2280 			len -= MMU_PAGESIZE512K;
   2281 			addr += MMU_PAGESIZE512K;
   2282 			pfn += MMU_PAGESIZE512K / MMU_PAGESIZE;
   2283 		} else if ((len >= MMU_PAGESIZE64K) &&
   2284 		    !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
   2285 		    !(disable_large_pages & (1 << TTE64K)) &&
   2286 		    !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) {
   2287 			sfmmu_memtte(&tte, pfn, attr, TTE64K);
   2288 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2289 			    flags, SFMMU_INVALID_SHMERID);
   2290 			len -= MMU_PAGESIZE64K;
   2291 			addr += MMU_PAGESIZE64K;
   2292 			pfn += MMU_PAGESIZE64K / MMU_PAGESIZE;
   2293 		} else {
   2294 			sfmmu_memtte(&tte, pfn, attr, TTE8K);
   2295 			(void) sfmmu_tteload_array(hat, &tte, addr, &pp,
   2296 			    flags, SFMMU_INVALID_SHMERID);
   2297 			len -= MMU_PAGESIZE;
   2298 			addr += MMU_PAGESIZE;
   2299 			pfn++;
   2300 		}
   2301 	}
   2302 
   2303 	/*
   2304 	 * Check TSB and TLB page sizes.
   2305 	 */
   2306 	if ((flags & HAT_LOAD_SHARE) == 0) {
   2307 		sfmmu_check_page_sizes(hat, 1);
   2308 	}
   2309 }
   2310 
/*
 * Load translations for the len bytes starting at addr onto the pages in
 * the pps array.
 *
 * Thin wrapper around hat_do_memload_array() that passes
 * SFMMU_INVALID_SHMERID, i.e. the mappings are not associated with any
 * hme region.
 */
void
hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
	struct page **pps, uint_t attr, uint_t flags)
{
	hat_do_memload_array(hat, addr, len, pps, attr, flags,
	    SFMMU_INVALID_SHMERID);
}
   2318 
   2319 void
   2320 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
   2321 	struct page **pps, uint_t attr, uint_t flags,
   2322 	hat_region_cookie_t rcookie)
   2323 {
   2324 	uint_t rid;
   2325 	if (rcookie == HAT_INVALID_REGION_COOKIE ||
   2326 	    hat->sfmmu_xhat_provider != NULL) {
   2327 		hat_do_memload_array(hat, addr, len, pps, attr, flags,
   2328 		    SFMMU_INVALID_SHMERID);
   2329 		return;
   2330 	}
   2331 	rid = (uint_t)((uint64_t)rcookie);
   2332 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   2333 	hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
   2334 }
   2335 
/*
 * Map the largest extent possible out of the page array. The array may NOT
 * be in order.  The largest possible mapping a page can have
 * is specified in the p_szc field.  The p_szc field
 * cannot change as long as there any mappings (large or small)
 * to any of the pages that make up the large page. (ie. any
 * promotion/demotion of page size is not up to the hat but up to
 * the page free list manager).  The array
 * should consist of properly aligned contiguous pages that are
 * part of a big page for a large mapping to be created.
 *
 * Common worker for hat_memload_array() and hat_memload_array_region().
 * rid is the hme region id, or SFMMU_INVALID_SHMERID.  xhat hats are
 * delegated to XHAT_MEMLOAD_ARRAY.
 *
 * Requests shorter than NHMENTS pages, or made while large pages are
 * disabled altogether, are loaded with 8K mappings and return without the
 * page size check that follows the main loop.
 */
static void
hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len,
	struct page **pps, uint_t attr, uint_t flags, uint_t rid)
{
	int  ttesz;
	size_t mapsz;
	pgcnt_t	numpg, npgs;
	tte_t tte;
	page_t *pp;
	uint_t large_pages_disable;

	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
	SFMMU_VALIDATE_HMERID(hat, rid, addr, len);

	if (hat->sfmmu_xhat_provider) {
		ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
		XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags);
		return;
	}

	if (hat->sfmmu_rmstat)
		hat_resvstat(len, hat->sfmmu_as, addr);

#if defined(SF_ERRATA_57)
	if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
	    (addr < errata57_limit) && (attr & PROT_EXEC) &&
	    !(flags & HAT_LOAD_SHARE)) {
		cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make "
		    "user page executable");
		attr &= ~PROT_EXEC;
	}
#endif

	/* Get number of pages */
	npgs = len >> MMU_PAGESHIFT;

	/* HAT_LOAD_SHARE loads use the ISM large page disable mask. */
	if (flags & HAT_LOAD_SHARE) {
		large_pages_disable = disable_ism_large_pages;
	} else {
		large_pages_disable = disable_large_pages;
	}

	if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) {
		sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
		    rid);
		return;
	}

	while (npgs >= NHMENTS) {
		pp = *pps;
		/*
		 * Try page sizes from the page's own size code downwards.
		 * The loop exits with ttesz != TTE8K (via break) only when
		 * a large mapping was loaded; numpg and mapsz then describe
		 * that mapping and are consumed after the loop.
		 */
		for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) {
			/*
			 * Check if this page size is disabled.
			 */
			if (large_pages_disable & (1 << ttesz))
				continue;

			numpg = TTEPAGES(ttesz);
			mapsz = numpg << MMU_PAGESHIFT;
			if ((npgs >= numpg) &&
			    IS_P2ALIGNED(addr, mapsz) &&
			    IS_P2ALIGNED(pp->p_pagenum, numpg)) {
				/*
				 * At this point we have enough pages and
				 * we know the virtual address and the pfn
				 * are properly aligned.  We still need
				 * to check for physical contiguity but since
				 * it is very likely that this is the case
				 * we will assume they are so and undo
				 * the request if necessary.  It would
				 * be great if we could get a hint flag
				 * like HAT_CONTIG which would tell us
				 * the pages are contiguous for sure.
				 */
				sfmmu_memtte(&tte, (*pps)->p_pagenum,
				    attr, ttesz);
				/* A zero return means the load succeeded. */
				if (!sfmmu_tteload_array(hat, &tte, addr,
				    pps, flags, rid)) {
					break;
				}
			}
		}
		if (ttesz == TTE8K) {
			/*
			 * We were not able to map array using a large page
			 * batch a hmeblk or fraction at a time.
			 */
			/*
			 * numpg becomes the number of 8K pages from addr up
			 * to the next NHMENTS-page boundary.
			 */
			numpg = ((uintptr_t)addr >> MMU_PAGESHIFT)
			    & (NHMENTS-1);
			numpg = NHMENTS - numpg;
			ASSERT(numpg <= npgs);
			mapsz = numpg * MMU_PAGESIZE;
			sfmmu_memload_batchsmall(hat, addr, pps, attr, flags,
			    numpg, rid);
		}
		/* Advance past whatever was just mapped, large or small. */
		addr += mapsz;
		npgs -= numpg;
		pps += numpg;
	}

	/* Fewer than NHMENTS pages remain: finish with 8K mappings. */
	if (npgs) {
		sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
		    rid);
	}

	/*
	 * Check TSB and TLB page sizes.
	 */
	if ((flags & HAT_LOAD_SHARE) == 0) {
		sfmmu_check_page_sizes(hat, 1);
	}
}
   2459 
   2460 /*
   2461  * Function tries to batch 8K pages into the same hme blk.
   2462  */
   2463 static void
   2464 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps,
   2465 		    uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid)
   2466 {
   2467 	tte_t	tte;
   2468 	page_t *pp;
   2469 	struct hmehash_bucket *hmebp;
   2470 	struct hme_blk *hmeblkp;
   2471 	int	index;
   2472 
   2473 	while (npgs) {
   2474 		/*
   2475 		 * Acquire the hash bucket.
   2476 		 */
   2477 		hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K,
   2478 		    rid);
   2479 		ASSERT(hmebp);
   2480 
   2481 		/*
   2482 		 * Find the hment block.
   2483 		 */
   2484 		hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr,
   2485 		    TTE8K, flags, rid);
   2486 		ASSERT(hmeblkp);
   2487 
   2488 		do {
   2489 			/*
   2490 			 * Make the tte.
   2491 			 */
   2492 			pp = *pps;
   2493 			sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
   2494 
   2495 			/*
   2496 			 * Add the translation.
   2497 			 */
   2498 			(void) sfmmu_tteload_addentry(hat, hmeblkp, &tte,
   2499 			    vaddr, pps, flags, rid);
   2500 
   2501 			/*
   2502 			 * Goto next page.
   2503 			 */
   2504 			pps++;
   2505 			npgs--;
   2506 
   2507 			/*
   2508 			 * Goto next address.
   2509 			 */
   2510 			vaddr += MMU_PAGESIZE;
   2511 
   2512 			/*
   2513 			 * Don't crossover into a different hmentblk.
   2514 			 */
   2515 			index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) &
   2516 			    (NHMENTS-1));
   2517 
   2518 		} while (index != 0 && npgs != 0);
   2519 
   2520 		/*
   2521 		 * Release the hash bucket.
   2522 		 */
   2523 
   2524 		sfmmu_tteload_release_hashbucket(hmebp);
   2525 	}
   2526 }
   2527 
   2528 /*
   2529  * Construct a tte for a page:
   2530  *
   2531  * tte_valid = 1
   2532  * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only)
   2533  * tte_size = size
   2534  * tte_nfo = attr & HAT_NOFAULT
   2535  * tte_ie = attr & HAT_STRUCTURE_LE
   2536  * tte_hmenum = hmenum
   2537  * tte_pahi = pfn >> TTE_PASHIFT;
   2538  * tte_palo = pfn & TTE_PALOMASK;
   2539  * tte_ref = 1 (optimization)
   2540  * tte_wr_perm = attr & PROT_WRITE;
   2541  * tte_no_sync = attr & HAT_NOSYNC
   2542  * tte_lock = attr & SFMMU_LOCKTTE
   2543  * tte_cp = !(attr & SFMMU_UNCACHEPTTE)
   2544  * tte_cv = !(attr & SFMMU_UNCACHEVTTE)
   2545  * tte_e = attr & SFMMU_SIDEFFECT
   2546  * tte_priv = !(attr & PROT_USER)
   2547  * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt)
   2548  * tte_glb = 0
         *
         * Arguments:
         *	ttep	- tte to fill in; both tte_inthi and tte_intlo are overwritten.
         *	pfn	- page frame number to map.
         *	attr	- mapping attributes; must not contain any bit outside
         *		  SFMMU_LOAD_ALLATTR (asserted).
         *	tte_sz	- tte page size code.
         *
         * The hmenum passed to the MAKE_TTE_* macros here is always 0.
         * The field encodings listed above are produced by MAKE_TTE_INTHI() and
         * MAKE_TTE_INTLO(), which are defined elsewhere; the only bits this
         * function sets explicitly are ref and mod for no-sync ttes.
         * Panics if the resulting tte has both the NFO and executable bits set.
   2549  */
   2550 void
   2551 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
   2552 {
   2553 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   2554 
   2555 	ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
   2556 	ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);
   2557 
        	/*
        	 * For a no-sync tte, set the ref bit now, and the mod bit as
        	 * well if the tte is writable.
        	 */
   2558 	if (TTE_IS_NOSYNC(ttep)) {
   2559 		TTE_SET_REF(ttep);
   2560 		if (TTE_IS_WRITABLE(ttep)) {
   2561 			TTE_SET_MOD(ttep);
   2562 		}
   2563 	}
        	/* NFO and EXEC are mutually exclusive; refuse to build such a tte. */
   2564 	if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
   2565 		panic("sfmmu_memtte: can't set both NFO and EXEC bits");
   2566 	}
   2567 }
   2568 
   2569 /*
   2570  * This function will add a translation to the hme_blk and allocate the
   2571  * hme_blk if one does not exist.
   2572  * If a page structure is specified then it will add the
   2573  * corresponding hment to the mapping list.
   2574  * It will also update the hmenum field for the tte.
   2575  *
   2576  * Currently this function is only used for kernel mappings.
   2577  * So pass invalid region to sfmmu_tteload_array().
   2578  */
   2579 void
   2580 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp,
   2581 	uint_t flags)
   2582 {
   2583 	ASSERT(sfmmup == ksfmmup);
   2584 	(void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags,
   2585 	    SFMMU_INVALID_SHMERID);
   2586 }
   2587 
   2588 /*
   2589  * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB.
   2590  * Assumes that a particular page size may only be resident in one TSB.
   2591  */
   2592 static void
   2593 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz)
   2594 {
   2595 	struct tsb_info *tsbinfop = NULL;
   2596 	uint64_t tag;
   2597 	struct tsbe *tsbe_addr;
   2598 	uint64_t tsb_base;
   2599 	uint_t tsb_size;
   2600 	int vpshift = MMU_PAGESHIFT;
   2601 	int phys = 0;
   2602 
   2603 	if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */
   2604 		phys = ktsb_phys;
   2605 		if (ttesz >= TTE4M) {
   2606 #ifndef sun4v
   2607 			ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
   2608 #endif
   2609 			tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
   2610 			tsb_size = ktsb4m_szcode;
   2611 		} else {
   2612 			tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
   2613 			tsb_size = ktsb_szcode;
   2614 		}
   2615 	} else {
   2616 		SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
   2617 
   2618 		/*
   2619 		 * If there isn't a TSB for this page size, or the TSB is
   2620 		 * swapped out, there is nothing to do.  Note that the latter
   2621 		 * case seems impossible but can occur if hat_pageunload()
   2622 		 * is called on an ISM mapping while the process is swapped
   2623 		 * out.
   2624 		 */
   2625 		if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
   2626 			return;
   2627 
   2628 		/*
   2629 		 * If another thread is in the middle of relocating a TSB
   2630 		 * we can't unload the entry so set a flag so that the
   2631 		 * TSB will be flushed before it can be accessed by the
   2632 		 * process.
   2633 		 */
   2634 		if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
   2635 			if (ttep == NULL)
   2636 				tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
   2637 			return;
   2638 		}
   2639 #if defined(UTSB_PHYS)
   2640 		phys = 1;
   2641 		tsb_base = (uint64_t)tsbinfop->tsb_pa;
   2642 #else
   2643 		tsb_base = (uint64_t)tsbinfop->tsb_va;
   2644 #endif
   2645 		tsb_size = tsbinfop->tsb_szc;
   2646 	}
   2647 	if (ttesz >= TTE4M)
   2648 		vpshift = MMU_PAGESHIFT4M;
   2649 
   2650 	tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
   2651 	tag = sfmmu_make_tsbtag(vaddr);
   2652 
   2653 	if (ttep == NULL) {
   2654 		sfmmu_unload_tsbe(tsbe_addr, tag, phys);
   2655 	} else {
   2656 		if (ttesz >= TTE4M) {
   2657 			SFMMU_STAT(sf_tsb_load4m);
   2658 		} else {
   2659 			SFMMU_STAT(sf_tsb_load8k);
   2660 		}
   2661 
   2662 		sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys);
   2663 	}
   2664 }
   2665 
   2666 /*
   2667  * Unmap all entries from [start, end) matching the given page size.
   2668  *
   2669  * This function is used primarily to unmap replicated 64K or 512K entries
   2670  * from the TSB that are inserted using the base page size TSB pointer, but
   2671  * it may also be called to unmap a range of addresses from the TSB.
   2672  */
   2673 void
   2674 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz)
   2675 {
   2676 	struct tsb_info *tsbinfop;
   2677 	uint64_t tag;
   2678 	struct tsbe *tsbe_addr;
   2679 	caddr_t vaddr;
   2680 	uint64_t tsb_base;
   2681 	int vpshift, vpgsz;
   2682 	uint_t tsb_size;
   2683 	int phys = 0;
   2684 
   2685 	/*
   2686 	 * Assumptions:
   2687 	 *  If ttesz == 8K, 64K or 512K, we walk through the range 8K
   2688 	 *  at a time shooting down any valid entries we encounter.
   2689 	 *
   2690 	 *  If ttesz >= 4M we walk the range 4M at a time shooting
   2691 	 *  down any valid mappings we find.
   2692 	 */
   2693 	if (sfmmup == ksfmmup) {
   2694 		phys = ktsb_phys;
   2695 		if (ttesz >= TTE4M) {
   2696 #ifndef sun4v
   2697 			ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
   2698 #endif
   2699 			tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
   2700 			tsb_size = ktsb4m_szcode;
   2701 		} else {
   2702 			tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
   2703 			tsb_size = ktsb_szcode;
   2704 		}
   2705 	} else {
   2706 		SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
   2707 
   2708 		/*
   2709 		 * If there isn't a TSB for this page size, or the TSB is
   2710 		 * swapped out, there is nothing to do.  Note that the latter
   2711 		 * case seems impossible but can occur if hat_pageunload()
   2712 		 * is called on an ISM mapping while the process is swapped
   2713 		 * out.
   2714 		 */
   2715 		if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
   2716 			return;
   2717 
   2718 		/*
   2719 		 * If another thread is in the middle of relocating a TSB
   2720 		 * we can't unload the entry so set a flag so that the
   2721 		 * TSB will be flushed before it can be accessed by the
   2722 		 * process.
   2723 		 */
   2724 		if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
   2725 			tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
   2726 			return;
   2727 		}
   2728 #if defined(UTSB_PHYS)
   2729 		phys = 1;
   2730 		tsb_base = (uint64_t)tsbinfop->tsb_pa;
   2731 #else
   2732 		tsb_base = (uint64_t)tsbinfop->tsb_va;
   2733 #endif
   2734 		tsb_size = tsbinfop->tsb_szc;
   2735 	}
   2736 	if (ttesz >= TTE4M) {
   2737 		vpshift = MMU_PAGESHIFT4M;
   2738 		vpgsz = MMU_PAGESIZE4M;
   2739 	} else {
   2740 		vpshift = MMU_PAGESHIFT;
   2741 		vpgsz = MMU_PAGESIZE;
   2742 	}
   2743 
   2744 	for (vaddr = start; vaddr < end; vaddr += vpgsz) {
   2745 		tag = sfmmu_make_tsbtag(vaddr);
   2746 		tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
   2747 		sfmmu_unload_tsbe(tsbe_addr, tag, phys);
   2748 	}
   2749 }
   2750 
   2751 /*
   2752  * Select the optimum TSB size given the number of mappings
   2753  * that need to be cached.
   2754  */
   2755 static int
   2756 sfmmu_select_tsb_szc(pgcnt_t pgcnt)
   2757 {
   2758 	int szc = 0;
   2759 
   2760 #ifdef DEBUG
   2761 	if (tsb_grow_stress) {
   2762 		uint32_t randval = (uint32_t)gettick() >> 4;
   2763 		return (randval % (tsb_max_growsize + 1));
   2764 	}
   2765 #endif	/* DEBUG */
   2766 
   2767 	while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc)))
   2768 		szc++;
   2769 	return (szc);
   2770 }
   2771 
   2772 /*
   2773  * This function will add a translation to the hme_blk and allocate the
   2774  * hme_blk if one does not exist.
   2775  * If a page structure is specified then it will add the
   2776  * corresponding hment to the mapping list.
   2777  * It will also update the hmenum field for the tte.
   2778  * Furthermore, it attempts to create a large page translation
   2779  * for <addr,hat> at page array pps.  It assumes addr and first
   2780  * pp is correctly aligned.  It returns 0 if successful and 1 otherwise.
   2781  */
   2782 static int
   2783 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr,
   2784 	page_t **pps, uint_t flags, uint_t rid)
   2785 {
   2786 	struct hmehash_bucket *hmebp;
   2787 	struct hme_blk *hmeblkp;
   2788 	int 	ret;
   2789 	uint_t	size;
   2790 
   2791 	/*
   2792 	 * Get mapping size.
   2793 	 */
   2794 	size = TTE_CSZ(ttep);
   2795 	ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
   2796 
   2797 	/*
   2798 	 * Acquire the hash bucket.
   2799 	 */
   2800 	hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid);
   2801 	ASSERT(hmebp);
   2802 
   2803 	/*
   2804 	 * Find the hment block.
   2805 	 */
   2806 	hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags,
   2807 	    rid);
   2808 	ASSERT(hmeblkp);
   2809 
   2810 	/*
   2811 	 * Add the translation.
   2812 	 */
   2813 	ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags,
   2814 	    rid);
   2815 
   2816 	/*
   2817 	 * Release the hash bucket.
   2818 	 */
   2819 	sfmmu_tteload_release_hashbucket(hmebp);
   2820 
   2821 	return (ret);
   2822 }
   2823 
   2824 /*
   2825  * Function locks and returns a pointer to the hash bucket for vaddr and size.
   2826  */
   2827 static struct hmehash_bucket *
   2828 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size,
   2829     uint_t rid)
   2830 {
   2831 	struct hmehash_bucket *hmebp;
   2832 	int hmeshift;
   2833 	void *htagid = sfmmutohtagid(sfmmup, rid);
   2834 
   2835 	ASSERT(htagid != NULL);
   2836 
   2837 	hmeshift = HME_HASH_SHIFT(size);
   2838 
   2839 	hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift);
   2840 
   2841 	SFMMU_HASH_LOCK(hmebp);
   2842 
   2843 	return (hmebp);
   2844 }
   2845 
   2846 /*
   2847  * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
   2848  * hmeblk doesn't exist for the [sfmmup, vaddr & size] signature, a hmeblk is
   2849  * allocated.
         *
         * The search is retried (via the ttearray_realloc label) when:
         *  - the block found is hblk_reserve and is in use by another thread;
         *    the bucket lock is dropped while waiting on hblk_reserve_lock and
         *    re-taken before retrying,
         *  - the block found has a different tte size; it is removed from the
         *    hash chain first,
         *  - the block found is a shadow hblk with a non-zero shadow mask; it is
         *    cleaned up with sfmmu_shadow_hcleanup() first.
         *
         * Blocks collected on the local list during the search are handed to
         * sfmmu_hblks_list_purge() before returning.
         *
         * The returned hmeblk is asserted to have tte size == size, a clear shadow
         * bit, hblk_shared set exactly when rid is a valid region id, and a tag
         * rid equal to rid.
   2850  */
   2851 static struct hme_blk *
   2852 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp,
   2853 	caddr_t vaddr, uint_t size, uint_t flags, uint_t rid)
   2854 {
   2855 	hmeblk_tag hblktag;
   2856 	int hmeshift;
   2857 	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
   2858 
   2859 	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
   2860 
        	/* Build the hash tag that identifies the hmeblk we are looking for. */
   2861 	hblktag.htag_id = sfmmutohtagid(sfmmup, rid);
   2862 	ASSERT(hblktag.htag_id != NULL);
   2863 	hmeshift = HME_HASH_SHIFT(size);
   2864 	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
   2865 	hblktag.htag_rehash = HME_HASH_REHASH(size);
   2866 	hblktag.htag_rid = rid;
   2867 
   2868 ttearray_realloc:
   2869 
   2870 	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
   2871 
   2872 	/*
   2873 	 * We block until hblk_reserve_lock is released; it's held by
   2874 	 * the thread, temporarily using hblk_reserve, until hblk_reserve is
   2875 	 * replaced by a hblk from sfmmu8_cache.
   2876 	 */
   2877 	if (hmeblkp == (struct hme_blk *)hblk_reserve &&
   2878 	    hblk_reserve_thread != curthread) {
   2879 		SFMMU_HASH_UNLOCK(hmebp);
   2880 		mutex_enter(&hblk_reserve_lock);
   2881 		mutex_exit(&hblk_reserve_lock);
   2882 		SFMMU_STAT(sf_hblk_reserve_hit);
   2883 		SFMMU_HASH_LOCK(hmebp);
   2884 		goto ttearray_realloc;
   2885 	}
   2886 
   2887 	if (hmeblkp == NULL) {
        		/* Not in the hash: allocate a new hmeblk for this tag. */
   2888 		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
   2889 		    hblktag, flags, rid);
   2890 		ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
   2891 		ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
   2892 	} else {
   2893 		/*
   2894 		 * It is possible for 8k and 64k hblks to collide since they
   2895 		 * have the same rehash value. This is because we
   2896 		 * lazily free hblks and 8K/64K blks could be lingering.
   2897 		 * If we find a size mismatch we free the block and try again.
   2898 		 */
   2899 		if (get_hblk_ttesz(hmeblkp) != size) {
   2900 			ASSERT(!hmeblkp->hblk_vcnt);
   2901 			ASSERT(!hmeblkp->hblk_hmecnt);
   2902 			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   2903 			    &list, 0);
   2904 			goto ttearray_realloc;
   2905 		}
   2906 		if (hmeblkp->hblk_shw_bit) {
   2907 			/*
   2908 			 * if the hblk was previously used as a shadow hblk then
   2909 			 * we will change it to a normal hblk
   2910 			 */
   2911 			ASSERT(!hmeblkp->hblk_shared);
   2912 			if (hmeblkp->hblk_shw_mask) {
   2913 				sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp);
   2914 				ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
   2915 				goto ttearray_realloc;
   2916 			} else {
   2917 				hmeblkp->hblk_shw_bit = 0;
   2918 			}
   2919 		}
   2920 		SFMMU_STAT(sf_hblk_hit);
   2921 	}
   2922 
   2923 	/*
   2924 	 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
   2925 	 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
   2926 	 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
   2927 	 * just add these hmeblks to the per-cpu pending queue.
   2928 	 */
   2929 	sfmmu_hblks_list_purge(&list, 1);
   2930 
   2931 	ASSERT(get_hblk_ttesz(hmeblkp) == size);
   2932 	ASSERT(!hmeblkp->hblk_shw_bit);
   2933 	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
   2934 	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
   2935 	ASSERT(hmeblkp->hblk_tag.htag_rid == rid);
   2936 
   2937 	return (hmeblkp);
   2938 }
   2939 
   2940 /*
   2941  * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1
   2942  * otherwise.
         *
         * Arguments:
         *	sfmmup	- hat the translation belongs to.
         *	hmeblkp	- hmeblk that covers vaddr; the hment slot is derived from
         *		  it with HBLKTOHME_IDX().
         *	ttep	- tte to install.  It is modified here: ref/mod may be set,
         *		  the virtual-cacheable bit may be cleared (VAC only), and
         *		  tte_hmenum is filled in.
         *	vaddr	- virtual address, aligned to the tte's page size (asserted).
         *	pps	- page array; *pps may be NULL, in which case no page
         *		  bookkeeping and no mapping-list locking is done.
         *	flags	- HAT_LOAD_* flags; SFMMU_NO_TSBLOAD is or'ed in locally when
         *		  the TSB must not be preloaded.
         *	rid	- region id, or SFMMU_INVALID_SHMERID for a private mapping.
         *
         * The only failure return (1) is when sfmmu_pagearray_setup() rejects the
         * page array for a large page mapping; the page's mapping-list lock is
         * released before returning in that case.
         *
         * Panics on: a non-privileged tte at or above USERLIMIT, a global tte
         * (where TTE_IS_GLOBAL is defined), an io<->memory remap, a remap to a
         * different pfn that is not permitted, and hblk_lckcnt overflow.
   2943  */
   2944 static int
   2945 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep,
   2946 	caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid)
   2947 {
   2948 	page_t *pp = *pps;
   2949 	int hmenum, size, remap;
   2950 	tte_t tteold, flush_tte;
   2951 #ifdef DEBUG
   2952 	tte_t orig_old;
   2953 #endif /* DEBUG */
   2954 	struct sf_hment *sfhme;
   2955 	kmutex_t *pml, *pmtx;
   2956 	hatlock_t *hatlockp;
   2957 	int myflt;
   2958 
   2959 	/*
   2960 	 * remove this panic when we decide to let user virtual address
   2961 	 * space be >= USERLIMIT.
   2962 	 */
   2963 	if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT)
   2964 		panic("user addr %p in kernel space", (void *)vaddr);
   2965 #if defined(TTE_IS_GLOBAL)
   2966 	if (TTE_IS_GLOBAL(ttep))
   2967 		panic("sfmmu_tteload: creating global tte");
   2968 #endif
   2969 
   2970 #ifdef DEBUG
   2971 	if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) &&
   2972 	    !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans)
   2973 		panic("sfmmu_tteload: non cacheable memory tte");
   2974 #endif /* DEBUG */
   2975 
   2976 	/* don't simulate dirty bit for writeable ISM/DISM mappings */
   2977 	if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) {
   2978 		TTE_SET_REF(ttep);
   2979 		TTE_SET_MOD(ttep);
   2980 	}
   2981 
   2982 	if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) ||
   2983 	    !TTE_IS_MOD(ttep)) {
   2984 		/*
   2985 		 * Don't load TSB for dummy as in ISM.  Also don't preload
   2986 		 * the TSB if the TTE isn't writable since we're likely to
   2987 		 * fault on it again -- preloading can be fairly expensive.
   2988 		 */
   2989 		flags |= SFMMU_NO_TSBLOAD;
   2990 	}
   2991 
        	/* Per-size load statistics. */
   2992 	size = TTE_CSZ(ttep);
   2993 	switch (size) {
   2994 	case TTE8K:
   2995 		SFMMU_STAT(sf_tteload8k);
   2996 		break;
   2997 	case TTE64K:
   2998 		SFMMU_STAT(sf_tteload64k);
   2999 		break;
   3000 	case TTE512K:
   3001 		SFMMU_STAT(sf_tteload512k);
   3002 		break;
   3003 	case TTE4M:
   3004 		SFMMU_STAT(sf_tteload4m);
   3005 		break;
   3006 	case (TTE32M):
   3007 		SFMMU_STAT(sf_tteload32m);
   3008 		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
   3009 		break;
   3010 	case (TTE256M):
   3011 		SFMMU_STAT(sf_tteload256m);
   3012 		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
   3013 		break;
   3014 	}
   3015 
   3016 	ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
   3017 	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
   3018 	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
   3019 	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
   3020 
        	/* Locate the hment (sfhme) and its slot number (hmenum) for vaddr. */
   3021 	HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum);
   3022 
   3023 	/*
   3024 	 * Need to grab mlist lock here so that pageunload
   3025 	 * will not change tte behind us.
   3026 	 */
   3027 	if (pp) {
   3028 		pml = sfmmu_mlist_enter(pp);
   3029 	}
   3030 
   3031 	sfmmu_copytte(&sfhme->hme_tte, &tteold);
   3032 	/*
   3033 	 * Look for corresponding hment and if valid verify
   3034 	 * pfns are equal.
   3035 	 */
   3036 	remap = TTE_IS_VALID(&tteold);
   3037 	if (remap) {
   3038 		pfn_t	new_pfn, old_pfn;
   3039 
   3040 		old_pfn = TTE_TO_PFN(vaddr, &tteold);
   3041 		new_pfn = TTE_TO_PFN(vaddr, ttep);
   3042 
   3043 		if (flags & HAT_LOAD_REMAP) {
   3044 			/* make sure we are remapping same type of pages */
   3045 			if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) {
   3046 				panic("sfmmu_tteload - tte remap io<->memory");
   3047 			}
   3048 			if (old_pfn != new_pfn &&
   3049 			    (pp != NULL || sfhme->hme_page != NULL)) {
   3050 				panic("sfmmu_tteload - tte remap pp != NULL");
   3051 			}
   3052 		} else if (old_pfn != new_pfn) {
   3053 			panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
   3054 			    (void *)hmeblkp);
   3055 		}
   3056 		ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
   3057 	}
   3058 
   3059 	if (pp) {
   3060 		if (size == TTE8K) {
   3061 #ifdef VAC
   3062 			/*
   3063 			 * Handle VAC consistency
   3064 			 */
   3065 			if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) {
   3066 				sfmmu_vac_conflict(sfmmup, vaddr, pp);
   3067 			}
   3068 #endif
   3069 
        			/*
        			 * Maintain the page's RO state: a writable mapping
        			 * clears it; a read-only mapping of a page that is
        			 * neither mapped nor modified sets it (mod is
        			 * re-checked after sfmmu_page_enter()).
        			 */
   3070 			if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
   3071 				pmtx = sfmmu_page_enter(pp);
   3072 				PP_CLRRO(pp);
   3073 				sfmmu_page_exit(pmtx);
   3074 			} else if (!PP_ISMAPPED(pp) &&
   3075 			    (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) {
   3076 				pmtx = sfmmu_page_enter(pp);
   3077 				if (!(PP_ISMOD(pp))) {
   3078 					PP_SETRO(pp);
   3079 				}
   3080 				sfmmu_page_exit(pmtx);
   3081 			}
   3082 
   3083 		} else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
   3084 			/*
   3085 			 * sfmmu_pagearray_setup failed so return
   3086 			 */
   3087 			sfmmu_mlist_exit(pml);
   3088 			return (1);
   3089 		}
   3090 	}
   3091 
   3092 	/*
   3093 	 * Make sure hment is not on a mapping list.
   3094 	 */
   3095 	ASSERT(remap || (sfhme->hme_page == NULL));
   3096 
   3097 	/* if it is not a remap then hme->next better be NULL */
   3098 	ASSERT((!remap) ? sfhme->hme_next == NULL : 1);
   3099 
   3100 	if (flags & HAT_LOAD_LOCK) {
   3101 		if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) {
   3102 			panic("too high lckcnt-hmeblk %p",
   3103 			    (void *)hmeblkp);
   3104 		}
   3105 		atomic_add_32(&hmeblkp->hblk_lckcnt, 1);
   3106 
   3107 		HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK);
   3108 	}
   3109 
   3110 #ifdef VAC
   3111 	if (pp && PP_ISNC(pp)) {
   3112 		/*
   3113 		 * If the physical page is marked to be uncacheable, like
   3114 		 * by a vac conflict, make sure the new mapping is also
   3115 		 * uncacheable.
   3116 		 */
   3117 		TTE_CLR_VCACHEABLE(ttep);
   3118 		ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
   3119 	}
   3120 #endif
        	/* Record which slot of the hmeblk this tte occupies. */
   3121 	ttep->tte_hmenum = hmenum;
   3122 
   3123 #ifdef DEBUG
   3124 	orig_old = tteold;
   3125 #endif /* DEBUG */
   3126 
        	/*
        	 * Install ttep into the hment, retrying for as long as
        	 * sfmmu_modifytte_try() returns a negative value.  For kernel
        	 * LOCK/REMAP loads the hment's current tte is re-read into tteold
        	 * before the next attempt.
        	 */
   3127 	while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) {
   3128 		if ((sfmmup == KHATID) &&
   3129 		    (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) {
   3130 			sfmmu_copytte(&sfhme->hme_tte, &tteold);
   3131 		}
   3132 #ifdef DEBUG
   3133 		chk_tte(&orig_old, &tteold, ttep, hmeblkp);
   3134 #endif /* DEBUG */
   3135 	}
   3136 	ASSERT(TTE_IS_VALID(&sfhme->hme_tte));
   3137 
        	/* The slot held no valid tte before: account for a new mapping. */
   3138 	if (!TTE_IS_VALID(&tteold)) {
   3139 
   3140 		atomic_add_16(&hmeblkp->hblk_vcnt, 1);
   3141 		if (rid == SFMMU_INVALID_SHMERID) {
   3142 			atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1);
   3143 		} else {
   3144 			sf_srd_t *srdp = sfmmup->sfmmu_srdp;
   3145 			sf_region_t *rgnp = srdp->srd_hmergnp[rid];
   3146 			/*
   3147 			 * We already accounted for region ttecnt's in sfmmu
   3148 			 * during hat_join_region() processing. Here we
   3149 			 * only update ttecnt's in region structure.
   3150 			 */
   3151 			atomic_add_long(&rgnp->rgn_ttecnt[size], 1);
   3152 		}
   3153 	}
   3154 
        	/* myflt: non-zero when the current process's hat is sfmmup. */
   3155 	myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup);
        	/*
        	 * For non-shared large page loads into a user hat, make sure the
        	 * hat's (or region's) tteflags record that this page size is in use.
        	 */
   3156 	if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 &&
   3157 	    sfmmup != ksfmmup) {
   3158 		uchar_t tteflag = 1 << size;
   3159 		if (rid == SFMMU_INVALID_SHMERID) {
   3160 			if (!(sfmmup->sfmmu_tteflags & tteflag)) {
   3161 				hatlockp = sfmmu_hat_enter(sfmmup);
   3162 				sfmmup->sfmmu_tteflags |= tteflag;
   3163 				sfmmu_hat_exit(hatlockp);
   3164 			}
   3165 		} else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
   3166 			hatlockp = sfmmu_hat_enter(sfmmup);
   3167 			sfmmup->sfmmu_rtteflags |= tteflag;
   3168 			sfmmu_hat_exit(hatlockp);
   3169 		}
   3170 		/*
   3171 		 * Update the current CPU tsbmiss area, so the current thread
   3172 		 * won't need to take the tsbmiss for the new pagesize.
   3173 		 * The other threads in the process will update their tsb
   3174 		 * miss area lazily in sfmmu_tsbmiss_exception() when they
   3175 		 * fail to find the translation for a newly added pagesize.
   3176 		 */
   3177 		if (size > TTE64K && myflt) {
   3178 			struct tsbmiss *tsbmp;
   3179 			kpreempt_disable();
   3180 			tsbmp = &tsbmiss_area[CPU->cpu_id];
   3181 			if (rid == SFMMU_INVALID_SHMERID) {
   3182 				if (!(tsbmp->uhat_tteflags & tteflag)) {
   3183 					tsbmp->uhat_tteflags |= tteflag;
   3184 				}
   3185 			} else {
   3186 				if (!(tsbmp->uhat_rtteflags & tteflag)) {
   3187 					tsbmp->uhat_rtteflags |= tteflag;
   3188 				}
   3189 			}
   3190 			kpreempt_enable();
   3191 		}
   3192 	}
   3193 
        	/* First 4M-or-larger text load into this hat: set HAT_4MTEXT_FLAG. */
   3194 	if (size >= TTE4M && (flags & HAT_LOAD_TEXT) &&
   3195 	    !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
   3196 		hatlockp = sfmmu_hat_enter(sfmmup);
   3197 		SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
   3198 		sfmmu_hat_exit(hatlockp);
   3199 	}
   3200 
        	/*
        	 * flush_tte holds the bits that differ between the old and new tte,
        	 * masked by hw_tte.
        	 */
   3201 	flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) &
   3202 	    hw_tte.tte_intlo;
   3203 	flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) &
   3204 	    hw_tte.tte_inthi;
   3205 
   3206 	if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) {
   3207 		/*
   3208 		 * If remap and new tte differs from old tte we need
   3209 		 * to sync the mod bit and flush TLB/TSB.  We don't
   3210 		 * need to sync ref bit because we currently always set
   3211 		 * ref bit in tteload.
   3212 		 */
   3213 		ASSERT(TTE_IS_REF(ttep));
   3214 		if (TTE_IS_MOD(&tteold)) {
   3215 			sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
   3216 		}
   3217 		/*
   3218 		 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
   3219 		 * hmes are only used for read only text. Adding this code for
   3220 		 * completeness and future use of shared hmeblks with writable
   3221 		 * mappings of VMODSORT vnodes.
   3222 		 */
   3223 		if (hmeblkp->hblk_shared) {
   3224 			cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr,
   3225 			    sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1);
   3226 			xt_sync(cpuset);
   3227 			SFMMU_STAT_ADD(sf_region_remap_demap, 1);
   3228 		} else {
   3229 			sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0);
   3230 			xt_sync(sfmmup->sfmmu_cpusran);
   3231 		}
   3232 	}
   3233 
   3234 	if ((flags & SFMMU_NO_TSBLOAD) == 0) {
   3235 		/*
   3236 		 * We only preload 8K and 4M mappings into the TSB, since
   3237 		 * 64K and 512K mappings are replicated and hence don't
   3238 		 * have a single, unique TSB entry. Ditto for 32M/256M.
   3239 		 */
   3240 		if (size == TTE8K || size == TTE4M) {
   3241 			sf_scd_t *scdp;
   3242 			hatlockp = sfmmu_hat_enter(sfmmup);
   3243 			/*
   3244 			 * Don't preload private TSB if the mapping is used
   3245 			 * by the shctx in the SCD.
   3246 			 */
   3247 			scdp = sfmmup->sfmmu_scdp;
   3248 			if (rid == SFMMU_INVALID_SHMERID || scdp == NULL ||
   3249 			    !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
   3250 				sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte,
   3251 				    size);
   3252 			}
   3253 			sfmmu_hat_exit(hatlockp);
   3254 		}
   3255 	}
        	/*
        	 * For a new (non-remap) mapping of a page, link the hment onto the
        	 * page and count it; then drop the mapping-list lock taken above.
        	 */
   3256 	if (pp) {
   3257 		if (!remap) {
   3258 			HME_ADD(sfhme, pp);
   3259 			atomic_add_16(&hmeblkp->hblk_hmecnt, 1);
   3260 			ASSERT(hmeblkp->hblk_hmecnt > 0);
   3261 
   3262 			/*
   3263 			 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
   3264 			 * see pageunload() for comment.
   3265 			 */
   3266 		}
   3267 		sfmmu_mlist_exit(pml);
   3268 	}
   3269 
   3270 	return (0);
   3271 }
   3272 /*
   3273  * Function unlocks the hash bucket hmebp.  The bucket lock must be held on
         * entry (asserted); it is the counterpart of
         * sfmmu_tteload_acquire_hashbucket().
   3274  */
   3275 static void
   3276 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp)
   3277 {
   3278 	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
   3279 	SFMMU_HASH_UNLOCK(hmebp);
   3280 }
   3281 
   3282 /*
   3283  * function which checks and sets up page array for a large
   3284  * translation.  Will set p_vcolor, p_index, p_ro fields.
   3285  * Assumes addr and pfnum of first page are properly aligned.
   3286  * Will check for physical contiguity. If the check fails it returns
   3287  * 1; otherwise it returns 0.
         *
         * Arguments:
         *	addr	- virtual address of the large mapping.
         *	pps	- array of TTEPAGES(ttesz) constituent pages.
         *	ttep	- tte being loaded; supplies the size and writability.
         *	remap	- non-zero when an existing valid mapping is being replaced;
         *		  in that case only the p_ro handling is done and the vac,
         *		  p_index and contiguity steps are skipped for every page.
   3288  */
   3289 static int
   3290 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap)
   3291 {
   3292 	int 	i, index, ttesz;
   3293 	pfn_t	pfnum;
   3294 	pgcnt_t	npgs;
   3295 	page_t *pp, *pp1;
   3296 	kmutex_t *pmtx;
   3297 #ifdef VAC
   3298 	int osz;
   3299 	int cflags = 0;
   3300 	int vac_err = 0;
   3301 #endif
   3302 	int newidx = 0;
   3303 
   3304 	ttesz = TTE_CSZ(ttep);
   3305 
   3306 	ASSERT(ttesz > TTE8K);
   3307 
   3308 	npgs = TTEPAGES(ttesz);
   3309 	index = PAGESZ_TO_INDEX(ttesz);
   3310 
   3311 	pfnum = (*pps)->p_pagenum;
   3312 	ASSERT(IS_P2ALIGNED(pfnum, npgs));
   3313 
   3314 	/*
   3315 	 * Save the first pp so we can do HAT_TMPNC at the end.
   3316 	 */
   3317 	pp1 = *pps;
   3318 #ifdef VAC
   3319 	osz = fnd_mapping_sz(pp1);
   3320 #endif
   3321 
   3322 	for (i = 0; i < npgs; i++, pps++) {
   3323 		pp = *pps;
   3324 		ASSERT(PAGE_LOCKED(pp));
   3325 		ASSERT(pp->p_szc >= ttesz);
   3326 		ASSERT(pp->p_szc == pp1->p_szc);
   3327 		ASSERT(sfmmu_mlist_held(pp));
   3328 
   3329 		/*
   3330 		 * XXX is it possible to maintain P_RO on the root only?
   3331 		 */
   3332 		if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
   3333 			pmtx = sfmmu_page_enter(pp);
   3334 			PP_CLRRO(pp);
   3335 			sfmmu_page_exit(pmtx);
   3336 		} else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) &&
   3337 		    !PP_ISMOD(pp)) {
   3338 			pmtx = sfmmu_page_enter(pp);
   3339 			if (!(PP_ISMOD(pp))) {
   3340 				PP_SETRO(pp);
   3341 			}
   3342 			sfmmu_page_exit(pmtx);
   3343 		}
   3344 
   3345 		/*
   3346 		 * If this is a remap we skip vac & contiguity checks.
   3347 		 */
   3348 		if (remap)
   3349 			continue;
   3350 
   3351 		/*
   3352 		 * set p_vcolor and detect any vac conflicts.
        		 * Once a conflict has been reported (vac_err != 0) the
        		 * remaining pages are no longer passed to
        		 * sfmmu_vacconflict_array().
   3353 		 */
   3354 #ifdef VAC
   3355 		if (vac_err == 0) {
   3356 			vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);
   3357 
   3358 		}
   3359 #endif
   3360 
   3361 		/*
   3362 		 * Save current index in case we need to undo it.
   3363 		 * Note: "PAGESZ_TO_INDEX(sz)	(1 << (sz))"
   3364 		 *	"SFMMU_INDEX_SHIFT	6"
   3365 		 *	 "SFMMU_INDEX_MASK	((1 << SFMMU_INDEX_SHIFT) - 1)"
   3366 		 *	 "PP_MAPINDEX(p_index)	(p_index & SFMMU_INDEX_MASK)"
   3367 		 *
   3368 		 * So:	index = PAGESZ_TO_INDEX(ttesz);
   3369 		 *	if ttesz == 1 then index = 0x2
   3370 		 *		    2 then index = 0x4
   3371 		 *		    3 then index = 0x8
   3372 		 *		    4 then index = 0x10
   3373 		 *		    5 then index = 0x20
   3374 		 * The code below checks if it's a new pagesize (ie, newidx)
   3375 		 * in case we need to take it back out of p_index,
   3376 		 * and then or's the new index into the existing index.
        		 * newidx is sticky: once any page lacked the bit it stays
        		 * set for the rest of the loop.
   3377 		 */
   3378 		if ((PP_MAPINDEX(pp) & index) == 0)
   3379 			newidx = 1;
   3380 		pp->p_index = (PP_MAPINDEX(pp) | index);
   3381 
   3382 		/*
   3383 		 * contiguity check
   3384 		 */
   3385 		if (pp->p_pagenum != pfnum) {
   3386 			/*
   3387 			 * If we fail the contiguity test then
   3388 			 * the only thing we need to fix is the p_index field.
   3389 			 * We might get a few extra flushes but since this
   3390 			 * path is rare that is ok.  The p_ro field will
   3391 			 * get automatically fixed on the next tteload to
   3392 			 * the page.  NO TNC bit is set yet.
        			 *
        			 * The loop below walks pps backwards from the
        			 * failing page to the first page.  When newidx is
        			 * set the index bit is cleared on every page visited.
        			 * NOTE(review): that includes pages which already had
        			 * the bit before this call -- presumably the "extra
        			 * flushes" mentioned above; confirm.
   3393 			 */
   3394 			while (i >= 0) {
   3395 				pp = *pps;
   3396 				if (newidx)
   3397 					pp->p_index = (PP_MAPINDEX(pp) &
   3398 					    ~index);
   3399 				pps--;
   3400 				i--;
   3401 			}
   3402 			return (1);
   3403 		}
   3404 		pfnum++;
   3405 		addr += MMU_PAGESIZE;
   3406 	}
   3407 
   3408 #ifdef VAC
   3409 	if (vac_err) {
   3410 		if (ttesz > osz) {
   3411 			/*
   3412 			 * There are some smaller mappings that causes vac
   3413 			 * conflicts. Convert all existing small mappings to
   3414 			 * TNC.
   3415 			 */
   3416 			SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
   3417 			sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
   3418 			    npgs);
   3419 		} else {
   3420 			/* EMPTY */
   3421 			/*
   3422 			 * If there exists a big page mapping,
   3423 			 * that means the whole existing big page
   3424 			 * has TNC setting already. No need to convert to
   3425 			 * TNC again.
   3426 			 */
   3427 			ASSERT(PP_ISTNC(pp1));
   3428 		}
   3429 	}
   3430 #endif	/* VAC */
   3431 
   3432 	return (0);
   3433 }
   3434 
   3435 #ifdef VAC
   3436 /*
   3437  * Routine that detects vac consistency for a large page. It also
   3438  * sets virtual color for all pp's for this big mapping.
   3439  */
   3440 static int
   3441 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
   3442 {
   3443 	int vcolor, ocolor;
   3444 
   3445 	ASSERT(sfmmu_mlist_held(pp));
   3446 
   3447 	if (PP_ISNC(pp)) {
   3448 		return (HAT_TMPNC);
   3449 	}
   3450 
   3451 	vcolor = addr_to_vcolor(addr);
   3452 	if (PP_NEWPAGE(pp)) {
   3453 		PP_SET_VCOLOR(pp, vcolor);
   3454 		return (0);
   3455 	}
   3456 
   3457 	ocolor = PP_GET_VCOLOR(pp);
   3458 	if (ocolor == vcolor) {
   3459 		return (0);
   3460 	}
   3461 
   3462 	if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
   3463 		/*
   3464 		 * Previous user of page had a differnet color
   3465 		 * but since there are no current users
   3466 		 * we just flush the cache and change the color.
   3467 		 * As an optimization for large pages we flush the
   3468 		 * entire cache of that color and set a flag.
   3469 		 */
   3470 		SFMMU_STAT(sf_pgcolor_conflict);
   3471 		if (!CacheColor_IsFlushed(*cflags, ocolor)) {
   3472 			CacheColor_SetFlushed(*cflags, ocolor);
   3473 			sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
   3474 		}
   3475 		PP_SET_VCOLOR(pp, vcolor);
   3476 		return (0);
   3477 	}
   3478 
   3479 	/*
   3480 	 * We got a real conflict with a current mapping.
   3481 	 * set flags to start unencaching all mappings
   3482 	 * and return failure so we restart looping
   3483 	 * the pp array from the beginning.
   3484 	 */
   3485 	return (HAT_TMPNC);
   3486 }
   3487 #endif	/* VAC */
   3488 
   3489 /*
   3490  * creates a large page shadow hmeblk for a tte.
   3491  * The purpose of this routine is to allow us to do quick unloads because
   3492  * the vm layer can easily pass a very large but sparsely populated range.
   3493  */
   3494 static struct hme_blk *
   3495 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
   3496 {
   3497 	struct hmehash_bucket *hmebp;
   3498 	hmeblk_tag hblktag;
   3499 	int hmeshift, size, vshift;
   3500 	uint_t shw_mask, newshw_mask;
   3501 	struct hme_blk *hmeblkp;
   3502 
   3503 	ASSERT(sfmmup != KHATID);
   3504 	if (mmu_page_sizes == max_mmu_page_sizes) {
   3505 		ASSERT(ttesz < TTE256M);
   3506 	} else {
   3507 		ASSERT(ttesz < TTE4M);
   3508 		ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
   3509 		ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
   3510 	}
   3511 
   3512 	if (ttesz == TTE8K) {
   3513 		size = TTE512K;
   3514 	} else {
   3515 		size = ++ttesz;
   3516 	}
   3517 
   3518 	hblktag.htag_id = sfmmup;
   3519 	hmeshift = HME_HASH_SHIFT(size);
   3520 	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
   3521 	hblktag.htag_rehash = HME_HASH_REHASH(size);
   3522 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   3523 	hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
   3524 
   3525 	SFMMU_HASH_LOCK(hmebp);
   3526 
   3527 	HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   3528 	ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
   3529 	if (hmeblkp == NULL) {
   3530 		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
   3531 		    hblktag, flags, SFMMU_INVALID_SHMERID);
   3532 	}
   3533 	ASSERT(hmeblkp);
   3534 	if (!hmeblkp->hblk_shw_mask) {
   3535 		/*
   3536 		 * if this is a unused hblk it was just allocated or could
   3537 		 * potentially be a previous large page hblk so we need to
   3538 		 * set the shadow bit.
   3539 		 */
   3540 		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
   3541 		hmeblkp->hblk_shw_bit = 1;
   3542 	} else if (hmeblkp->hblk_shw_bit == 0) {
   3543 		panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
   3544 		    (void *)hmeblkp);
   3545 	}
   3546 	ASSERT(hmeblkp->hblk_shw_bit == 1);
   3547 	ASSERT(!hmeblkp->hblk_shared);
   3548 	vshift = vaddr_to_vshift(hblktag, vaddr, size);
   3549 	ASSERT(vshift < 8);
   3550 	/*
   3551 	 * Atomically set shw mask bit
   3552 	 */
   3553 	do {
   3554 		shw_mask = hmeblkp->hblk_shw_mask;
   3555 		newshw_mask = shw_mask | (1 << vshift);
   3556 		newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask,
   3557 		    newshw_mask);
   3558 	} while (newshw_mask != shw_mask);
   3559 
   3560 	SFMMU_HASH_UNLOCK(hmebp);
   3561 
   3562 	return (hmeblkp);
   3563 }
   3564 
   3565 /*
   3566  * This routine cleanup a previous shadow hmeblk and changes it to
   3567  * a regular hblk.  This happens rarely but it is possible
   3568  * when a process wants to use large pages and there are hblks still
   3569  * lying around from the previous as that used these hmeblks.
   3570  * The alternative was to cleanup the shadow hblks at unload time
   3571  * but since so few user processes actually use large pages, it is
   3572  * better to be lazy and cleanup at this time.
   3573  */
   3574 static void
   3575 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
   3576 	struct hmehash_bucket *hmebp)
   3577 {
   3578 	caddr_t addr, endaddr;
   3579 	int hashno, size;
   3580 
   3581 	ASSERT(hmeblkp->hblk_shw_bit);
   3582 	ASSERT(!hmeblkp->hblk_shared);
   3583 
   3584 	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
   3585 
   3586 	if (!hmeblkp->hblk_shw_mask) {
   3587 		hmeblkp->hblk_shw_bit = 0;
   3588 		return;
   3589 	}
   3590 	addr = (caddr_t)get_hblk_base(hmeblkp);
   3591 	endaddr = get_hblk_endaddr(hmeblkp);
   3592 	size = get_hblk_ttesz(hmeblkp);
   3593 	hashno = size - 1;
   3594 	ASSERT(hashno > 0);
   3595 	SFMMU_HASH_UNLOCK(hmebp);
   3596 
   3597 	sfmmu_free_hblks(sfmmup, addr, endaddr, hashno);
   3598 
   3599 	SFMMU_HASH_LOCK(hmebp);
   3600 }
   3601 
   3602 static void
   3603 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr,
   3604 	int hashno)
   3605 {
   3606 	int hmeshift, shadow = 0;
   3607 	hmeblk_tag hblktag;
   3608 	struct hmehash_bucket *hmebp;
   3609 	struct hme_blk *hmeblkp;
   3610 	struct hme_blk *nx_hblk, *pr_hblk, *list = NULL;
   3611 
   3612 	ASSERT(hashno > 0);
   3613 	hblktag.htag_id = sfmmup;
   3614 	hblktag.htag_rehash = hashno;
   3615 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   3616 
   3617 	hmeshift = HME_HASH_SHIFT(hashno);
   3618 
   3619 	while (addr < endaddr) {
   3620 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   3621 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   3622 		SFMMU_HASH_LOCK(hmebp);
   3623 		/* inline HME_HASH_SEARCH */
   3624 		hmeblkp = hmebp->hmeblkp;
   3625 		pr_hblk = NULL;
   3626 		while (hmeblkp) {
   3627 			if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) {
   3628 				/* found hme_blk */
   3629 				ASSERT(!hmeblkp->hblk_shared);
   3630 				if (hmeblkp->hblk_shw_bit) {
   3631 					if (hmeblkp->hblk_shw_mask) {
   3632 						shadow = 1;
   3633 						sfmmu_shadow_hcleanup(sfmmup,
   3634 						    hmeblkp, hmebp);
   3635 						break;
   3636 					} else {
   3637 						hmeblkp->hblk_shw_bit = 0;
   3638 					}
   3639 				}
   3640 
   3641 				/*
   3642 				 * Hblk_hmecnt and hblk_vcnt could be non zero
   3643 				 * since hblk_unload() does not gurantee that.
   3644 				 *
   3645 				 * XXX - this could cause tteload() to spin
   3646 				 * where sfmmu_shadow_hcleanup() is called.
   3647 				 */
   3648 			}
   3649 
   3650 			nx_hblk = hmeblkp->hblk_next;
   3651 			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
   3652 				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   3653 				    &list, 0);
   3654 			} else {
   3655 				pr_hblk = hmeblkp;
   3656 			}
   3657 			hmeblkp = nx_hblk;
   3658 		}
   3659 
   3660 		SFMMU_HASH_UNLOCK(hmebp);
   3661 
   3662 		if (shadow) {
   3663 			/*
   3664 			 * We found another shadow hblk so cleaned its
   3665 			 * children.  We need to go back and cleanup
   3666 			 * the original hblk so we don't change the
   3667 			 * addr.
   3668 			 */
   3669 			shadow = 0;
   3670 		} else {
   3671 			addr = (caddr_t)roundup((uintptr_t)addr + 1,
   3672 			    (1 << hmeshift));
   3673 		}
   3674 	}
   3675 	sfmmu_hblks_list_purge(&list, 0);
   3676 }
   3677 
   3678 /*
   3679  * This routine's job is to delete stale invalid shared hmeregions hmeblks that
   3680  * may still linger on after pageunload.
   3681  */
   3682 static void
   3683 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz)
   3684 {
   3685 	int hmeshift;
   3686 	hmeblk_tag hblktag;
   3687 	struct hmehash_bucket *hmebp;
   3688 	struct hme_blk *hmeblkp;
   3689 	struct hme_blk *pr_hblk;
   3690 	struct hme_blk *list = NULL;
   3691 
   3692 	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   3693 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   3694 
   3695 	hmeshift = HME_HASH_SHIFT(ttesz);
   3696 	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   3697 	hblktag.htag_rehash = ttesz;
   3698 	hblktag.htag_rid = rid;
   3699 	hblktag.htag_id = srdp;
   3700 	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
   3701 
   3702 	SFMMU_HASH_LOCK(hmebp);
   3703 	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
   3704 	if (hmeblkp != NULL) {
   3705 		ASSERT(hmeblkp->hblk_shared);
   3706 		ASSERT(!hmeblkp->hblk_shw_bit);
   3707 		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
   3708 			panic("sfmmu_cleanup_rhblk: valid hmeblk");
   3709 		}
   3710 		ASSERT(!hmeblkp->hblk_lckcnt);
   3711 		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   3712 		    &list, 0);
   3713 	}
   3714 	SFMMU_HASH_UNLOCK(hmebp);
   3715 	sfmmu_hblks_list_purge(&list, 0);
   3716 }
   3717 
   3718 /* ARGSUSED */
   3719 static void
   3720 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
   3721     size_t r_size, void *r_obj, u_offset_t r_objoff)
   3722 {
   3723 }
   3724 
   3725 /*
   3726  * Searches for an hmeblk which maps addr, then unloads this mapping
   3727  * and updates *eaddrp, if the hmeblk is found.
   3728  */
   3729 static void
   3730 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr,
   3731     caddr_t eaddr, int ttesz, caddr_t *eaddrp)
   3732 {
   3733 	int hmeshift;
   3734 	hmeblk_tag hblktag;
   3735 	struct hmehash_bucket *hmebp;
   3736 	struct hme_blk *hmeblkp;
   3737 	struct hme_blk *pr_hblk;
   3738 	struct hme_blk *list = NULL;
   3739 
   3740 	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   3741 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   3742 	ASSERT(ttesz >= HBLK_MIN_TTESZ);
   3743 
   3744 	hmeshift = HME_HASH_SHIFT(ttesz);
   3745 	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   3746 	hblktag.htag_rehash = ttesz;
   3747 	hblktag.htag_rid = rid;
   3748 	hblktag.htag_id = srdp;
   3749 	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
   3750 
   3751 	SFMMU_HASH_LOCK(hmebp);
   3752 	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
   3753 	if (hmeblkp != NULL) {
   3754 		ASSERT(hmeblkp->hblk_shared);
   3755 		ASSERT(!hmeblkp->hblk_lckcnt);
   3756 		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
   3757 			*eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr,
   3758 			    eaddr, NULL, HAT_UNLOAD);
   3759 			ASSERT(*eaddrp > addr);
   3760 		}
   3761 		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
   3762 		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   3763 		    &list, 0);
   3764 	}
   3765 	SFMMU_HASH_UNLOCK(hmebp);
   3766 	sfmmu_hblks_list_purge(&list, 0);
   3767 }
   3768 
   3769 static void
   3770 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp)
   3771 {
   3772 	int ttesz = rgnp->rgn_pgszc;
   3773 	size_t rsz = rgnp->rgn_size;
   3774 	caddr_t rsaddr = rgnp->rgn_saddr;
   3775 	caddr_t readdr = rsaddr + rsz;
   3776 	caddr_t rhsaddr;
   3777 	caddr_t va;
   3778 	uint_t rid = rgnp->rgn_id;
   3779 	caddr_t cbsaddr;
   3780 	caddr_t cbeaddr;
   3781 	hat_rgn_cb_func_t rcbfunc;
   3782 	ulong_t cnt;
   3783 
   3784 	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   3785 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   3786 
   3787 	ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz)));
   3788 	ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz)));
   3789 	if (ttesz < HBLK_MIN_TTESZ) {
   3790 		ttesz = HBLK_MIN_TTESZ;
   3791 		rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES);
   3792 	} else {
   3793 		rhsaddr = rsaddr;
   3794 	}
   3795 
   3796 	if ((rcbfunc = rgnp->rgn_cb_function) == NULL) {
   3797 		rcbfunc = sfmmu_rgn_cb_noop;
   3798 	}
   3799 
   3800 	while (ttesz >= HBLK_MIN_TTESZ) {
   3801 		cbsaddr = rsaddr;
   3802 		cbeaddr = rsaddr;
   3803 		if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
   3804 			ttesz--;
   3805 			continue;
   3806 		}
   3807 		cnt = 0;
   3808 		va = rsaddr;
   3809 		while (va < readdr) {
   3810 			ASSERT(va >= rhsaddr);
   3811 			if (va != cbeaddr) {
   3812 				if (cbeaddr != cbsaddr) {
   3813 					ASSERT(cbeaddr > cbsaddr);
   3814 					(*rcbfunc)(cbsaddr, cbeaddr,
   3815 					    rsaddr, rsz, rgnp->rgn_obj,
   3816 					    rgnp->rgn_objoff);
   3817 				}
   3818 				cbsaddr = va;
   3819 				cbeaddr = va;
   3820 			}
   3821 			sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
   3822 			    ttesz, &cbeaddr);
   3823 			cnt++;
   3824 			va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
   3825 		}
   3826 		if (cbeaddr != cbsaddr) {
   3827 			ASSERT(cbeaddr > cbsaddr);
   3828 			(*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
   3829 			    rsz, rgnp->rgn_obj,
   3830 			    rgnp->rgn_objoff);
   3831 		}
   3832 		ttesz--;
   3833 	}
   3834 }
   3835 
   3836 /*
   3837  * Release one hardware address translation lock on the given address range.
   3838  */
   3839 void
   3840 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
   3841 {
   3842 	struct hmehash_bucket *hmebp;
   3843 	hmeblk_tag hblktag;
   3844 	int hmeshift, hashno = 1;
   3845 	struct hme_blk *hmeblkp, *list = NULL;
   3846 	caddr_t endaddr;
   3847 
   3848 	ASSERT(sfmmup != NULL);
   3849 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   3850 
   3851 	ASSERT((sfmmup == ksfmmup) ||
   3852 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   3853 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   3854 	endaddr = addr + len;
   3855 	hblktag.htag_id = sfmmup;
   3856 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   3857 
   3858 	/*
   3859 	 * Spitfire supports 4 page sizes.
   3860 	 * Most pages are expected to be of the smallest page size (8K) and
   3861 	 * these will not need to be rehashed. 64K pages also don't need to be
   3862 	 * rehashed because an hmeblk spans 64K of address space. 512K pages
   3863 	 * might need 1 rehash and and 4M pages might need 2 rehashes.
   3864 	 */
   3865 	while (addr < endaddr) {
   3866 		hmeshift = HME_HASH_SHIFT(hashno);
   3867 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   3868 		hblktag.htag_rehash = hashno;
   3869 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   3870 
   3871 		SFMMU_HASH_LOCK(hmebp);
   3872 
   3873 		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
   3874 		if (hmeblkp != NULL) {
   3875 			ASSERT(!hmeblkp->hblk_shared);
   3876 			/*
   3877 			 * If we encounter a shadow hmeblk then
   3878 			 * we know there are no valid hmeblks mapping
   3879 			 * this address at this size or larger.
   3880 			 * Just increment address by the smallest
   3881 			 * page size.
   3882 			 */
   3883 			if (hmeblkp->hblk_shw_bit) {
   3884 				addr += MMU_PAGESIZE;
   3885 			} else {
   3886 				addr = sfmmu_hblk_unlock(hmeblkp, addr,
   3887 				    endaddr);
   3888 			}
   3889 			SFMMU_HASH_UNLOCK(hmebp);
   3890 			hashno = 1;
   3891 			continue;
   3892 		}
   3893 		SFMMU_HASH_UNLOCK(hmebp);
   3894 
   3895 		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
   3896 			/*
   3897 			 * We have traversed the whole list and rehashed
   3898 			 * if necessary without finding the address to unlock
   3899 			 * which should never happen.
   3900 			 */
   3901 			panic("sfmmu_unlock: addr not found. "
   3902 			    "addr %p hat %p", (void *)addr, (void *)sfmmup);
   3903 		} else {
   3904 			hashno++;
   3905 		}
   3906 	}
   3907 
   3908 	sfmmu_hblks_list_purge(&list, 0);
   3909 }
   3910 
   3911 void
   3912 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
   3913     hat_region_cookie_t rcookie)
   3914 {
   3915 	sf_srd_t *srdp;
   3916 	sf_region_t *rgnp;
   3917 	int ttesz;
   3918 	uint_t rid;
   3919 	caddr_t eaddr;
   3920 	caddr_t va;
   3921 	int hmeshift;
   3922 	hmeblk_tag hblktag;
   3923 	struct hmehash_bucket *hmebp;
   3924 	struct hme_blk *hmeblkp;
   3925 	struct hme_blk *pr_hblk;
   3926 	struct hme_blk *list;
   3927 
   3928 	if (rcookie == HAT_INVALID_REGION_COOKIE) {
   3929 		hat_unlock(sfmmup, addr, len);
   3930 		return;
   3931 	}
   3932 
   3933 	ASSERT(sfmmup != NULL);
   3934 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   3935 	ASSERT(sfmmup != ksfmmup);
   3936 
   3937 	srdp = sfmmup->sfmmu_srdp;
   3938 	rid = (uint_t)((uint64_t)rcookie);
   3939 	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   3940 	eaddr = addr + len;
   3941 	va = addr;
   3942 	list = NULL;
   3943 	rgnp = srdp->srd_hmergnp[rid];
   3944 	SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);
   3945 
   3946 	ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
   3947 	ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
   3948 	if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
   3949 		ttesz = HBLK_MIN_TTESZ;
   3950 	} else {
   3951 		ttesz = rgnp->rgn_pgszc;
   3952 	}
   3953 	while (va < eaddr) {
   3954 		while (ttesz < rgnp->rgn_pgszc &&
   3955 		    IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
   3956 			ttesz++;
   3957 		}
   3958 		while (ttesz >= HBLK_MIN_TTESZ) {
   3959 			if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
   3960 				ttesz--;
   3961 				continue;
   3962 			}
   3963 			hmeshift = HME_HASH_SHIFT(ttesz);
   3964 			hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
   3965 			hblktag.htag_rehash = ttesz;
   3966 			hblktag.htag_rid = rid;
   3967 			hblktag.htag_id = srdp;
   3968 			hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
   3969 			SFMMU_HASH_LOCK(hmebp);
   3970 			HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
   3971 			    &list);
   3972 			if (hmeblkp == NULL) {
   3973 				SFMMU_HASH_UNLOCK(hmebp);
   3974 				ttesz--;
   3975 				continue;
   3976 			}
   3977 			ASSERT(hmeblkp->hblk_shared);
   3978 			va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
   3979 			ASSERT(va >= eaddr ||
   3980 			    IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
   3981 			SFMMU_HASH_UNLOCK(hmebp);
   3982 			break;
   3983 		}
   3984 		if (ttesz < HBLK_MIN_TTESZ) {
   3985 			panic("hat_unlock_region: addr not found "
   3986 			    "addr %p hat %p", (void *)va, (void *)sfmmup);
   3987 		}
   3988 	}
   3989 	sfmmu_hblks_list_purge(&list, 0);
   3990 }
   3991 
   3992 /*
   3993  * Function to unlock a range of addresses in an hmeblk.  It returns the
   3994  * next address that needs to be unlocked.
   3995  * Should be called with the hash lock held.
   3996  */
   3997 static caddr_t
   3998 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
   3999 {
   4000 	struct sf_hment *sfhme;
   4001 	tte_t tteold, ttemod;
   4002 	int ttesz, ret;
   4003 
   4004 	ASSERT(in_hblk_range(hmeblkp, addr));
   4005 	ASSERT(hmeblkp->hblk_shw_bit == 0);
   4006 
   4007 	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
   4008 	ttesz = get_hblk_ttesz(hmeblkp);
   4009 
   4010 	HBLKTOHME(sfhme, hmeblkp, addr);
   4011 	while (addr < endaddr) {
   4012 readtte:
   4013 		sfmmu_copytte(&sfhme->hme_tte, &tteold);
   4014 		if (TTE_IS_VALID(&tteold)) {
   4015 
   4016 			ttemod = tteold;
   4017 
   4018 			ret = sfmmu_modifytte_try(&tteold, &ttemod,
   4019 			    &sfhme->hme_tte);
   4020 
   4021 			if (ret < 0)
   4022 				goto readtte;
   4023 
   4024 			if (hmeblkp->hblk_lckcnt == 0)
   4025 				panic("zero hblk lckcnt");
   4026 
   4027 			if (((uintptr_t)addr + TTEBYTES(ttesz)) >
   4028 			    (uintptr_t)endaddr)
   4029 				panic("can't unlock large tte");
   4030 
   4031 			ASSERT(hmeblkp->hblk_lckcnt > 0);
   4032 			atomic_add_32(&hmeblkp->hblk_lckcnt, -1);
   4033 			HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
   4034 		} else {
   4035 			panic("sfmmu_hblk_unlock: invalid tte");
   4036 		}
   4037 		addr += TTEBYTES(ttesz);
   4038 		sfhme++;
   4039 	}
   4040 	return (addr);
   4041 }
   4042 
   4043 /*
   4044  * Physical Address Mapping Framework
   4045  *
   4046  * General rules:
   4047  *
   4048  * (1) Applies only to seg_kmem memory pages. To make things easier,
   4049  *     seg_kpm addresses are also accepted by the routines, but nothing
   4050  *     is done with them since by definition their PA mappings are static.
   4051  * (2) hat_add_callback() may only be called while holding the page lock
   4052  *     SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
   4053  *     or passing HAC_PAGELOCK flag.
   4054  * (3) prehandler() and posthandler() may not call hat_add_callback() or
   4055  *     hat_delete_callback(), nor should they allocate memory. Post quiesce
   4056  *     callbacks may not sleep or acquire adaptive mutex locks.
   4057  * (4) Either prehandler() or posthandler() (but not both) may be specified
   4058  *     as being NULL.  Specifying an errhandler() is optional.
   4059  *
   4060  * Details of using the framework:
   4061  *
   4062  * registering a callback (hat_register_callback())
   4063  *
   4064  *	Pass prehandler, posthandler, errhandler addresses
   4065  *	as described below. If capture_cpus argument is nonzero,
   4066  *	suspend callback to the prehandler will occur with CPUs
   4067  *	captured and executing xc_loop() and CPUs will remain
   4068  *	captured until after the posthandler suspend callback
   4069  *	occurs.
   4070  *
   4071  * adding a callback (hat_add_callback())
   4072  *
   4073  *      as_pagelock();
   4074  *	hat_add_callback();
   4075  *      save returned pfn in private data structures or program registers;
   4076  *      as_pageunlock();
   4077  *
   4078  * prehandler()
   4079  *
   4080  *	Stop all accesses by physical address to this memory page.
   4081  *	Called twice: the first call, PRESUSPEND, is made in a context
   4082  *	where adaptive locks may be acquired. The second, SUSPEND, is
   4083  *	called at high PIL with CPUs captured, so adaptive locks may NOT be
   4084  *	acquired (and all spin locks must be XCALL_PIL or higher locks).
   4085  *
   4086  *	May return the following errors:
   4087  *		EIO:	A fatal error has occurred. This will result in panic.
   4088  *		EAGAIN:	The page cannot be suspended. This will fail the
   4089  *			relocation.
   4090  *		0:	Success.
   4091  *
   4092  * posthandler()
   4093  *
   4094  *      Save new pfn in private data structures or program registers;
   4095  *	not allowed to fail (non-zero return values will result in panic).
   4096  *
   4097  * errhandler()
   4098  *
   4099  *	called when an error occurs related to the callback.  Currently
   4100  *	the only such error is HAT_CB_ERR_LEAKED which indicates that
   4101  *	a page is being freed, but there are still outstanding callback(s)
   4102  *	registered on the page.
   4103  *
   4104  * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
   4105  *
   4106  *	stop using physical address
   4107  *	hat_delete_callback();
   4108  *
   4109  */
   4110 
   4111 /*
   4112  * Register a callback class.  Each subsystem should do this once and
   4113  * cache the id_t returned for use in setting up and tearing down callbacks.
   4114  *
   4115  * There is no facility for removing callback IDs once they are created;
   4116  * the "key" should be unique for each module, so in case a module is unloaded
   4117  * and subsequently re-loaded, we can recycle the module's previous entry.
   4118  */
   4119 id_t
   4120 hat_register_callback(int key,
   4121 	int (*prehandler)(caddr_t, uint_t, uint_t, void *),
   4122 	int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
   4123 	int (*errhandler)(caddr_t, uint_t, uint_t, void *),
   4124 	int capture_cpus)
   4125 {
   4126 	id_t id;
   4127 
   4128 	/*
   4129 	 * Search the table for a pre-existing callback associated with
   4130 	 * the identifier "key".  If one exists, we re-use that entry in
   4131 	 * the table for this instance, otherwise we assign the next
   4132 	 * available table slot.
   4133 	 */
   4134 	for (id = 0; id < sfmmu_max_cb_id; id++) {
   4135 		if (sfmmu_cb_table[id].key == key)
   4136 			break;
   4137 	}
   4138 
   4139 	if (id == sfmmu_max_cb_id) {
   4140 		id = sfmmu_cb_nextid++;
   4141 		if (id >= sfmmu_max_cb_id)
   4142 			panic("hat_register_callback: out of callback IDs");
   4143 	}
   4144 
   4145 	ASSERT(prehandler != NULL || posthandler != NULL);
   4146 
   4147 	sfmmu_cb_table[id].key = key;
   4148 	sfmmu_cb_table[id].prehandler = prehandler;
   4149 	sfmmu_cb_table[id].posthandler = posthandler;
   4150 	sfmmu_cb_table[id].errhandler = errhandler;
   4151 	sfmmu_cb_table[id].capture_cpus = capture_cpus;
   4152 
   4153 	return (id);
   4154 }
   4155 
   4156 #define	HAC_COOKIE_NONE	(void *)-1
   4157 
   4158 /*
   4159  * Add relocation callbacks to the specified addr/len which will be called
   4160  * when relocating the associated page. See the description of pre and
   4161  * posthandler above for more details.
   4162  *
   4163  * If HAC_PAGELOCK is included in flags, the underlying memory page is
   4164  * locked internally so the caller must be able to deal with the callback
   4165  * running even before this function has returned.  If HAC_PAGELOCK is not
   4166  * set, it is assumed that the underlying memory pages are locked.
   4167  *
   4168  * Since the caller must track the individual page boundaries anyway,
   4169  * we only allow a callback to be added to a single page (large
   4170  * or small).  Thus [addr, addr + len) MUST be contained within a single
   4171  * page.
   4172  *
   4173  * Registering multiple callbacks on the same [addr, addr+len) is supported,
   4174  * _provided_that_ a unique parameter is specified for each callback.
   4175  * If multiple callbacks are registered on the same range the callback will
   4176  * be invoked with each unique parameter. Registering the same callback with
   4177  * the same argument more than once will result in corrupted kernel state.
   4178  *
   4179  * Returns the pfn of the underlying kernel page in *rpfn
   4180  * on success, or PFN_INVALID on failure.
   4181  *
   4182  * cookiep (if passed) provides storage space for an opaque cookie
   4183  * to return later to hat_delete_callback(). This cookie makes the callback
   4184  * deletion significantly quicker by avoiding a potentially lengthy hash
   4185  * search.
   4186  *
   4187  * Return values:
   4188  *    0:      success
   4189  *    ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP)
   4190  *    EINVAL: callback ID is not valid
   4191  *    ENXIO:  ["vaddr", "vaddr" + len) is not mapped in the kernel's address
   4192  *            space
   4193  *    ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary
   4194  */
   4195 int
   4196 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
   4197 	void *pvt, pfn_t *rpfn, void **cookiep)
   4198 {
   4199 	struct 		hmehash_bucket *hmebp;
   4200 	hmeblk_tag 	hblktag;
   4201 	struct hme_blk	*hmeblkp;
   4202 	int 		hmeshift, hashno;
   4203 	caddr_t 	saddr, eaddr, baseaddr;
   4204 	struct pa_hment *pahmep;
   4205 	struct sf_hment *sfhmep, *osfhmep;
   4206 	kmutex_t	*pml;
   4207 	tte_t   	tte;
   4208 	page_t		*pp;
   4209 	vnode_t		*vp;
   4210 	u_offset_t	off;
   4211 	pfn_t		pfn;
   4212 	int		kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP;
   4213 	int		locked = 0;
   4214 
   4215 	/*
   4216 	 * For KPM mappings, just return the physical address since we
   4217 	 * don't need to register any callbacks.
   4218 	 */
   4219 	if (IS_KPM_ADDR(vaddr)) {
   4220 		uint64_t paddr;
   4221 		SFMMU_KPM_VTOP(vaddr, paddr);
   4222 		*rpfn = btop(paddr);
   4223 		if (cookiep != NULL)
   4224 			*cookiep = HAC_COOKIE_NONE;
   4225 		return (0);
   4226 	}
   4227 
   4228 	if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
   4229 		*rpfn = PFN_INVALID;
   4230 		return (EINVAL);
   4231 	}
   4232 
   4233 	if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
   4234 		*rpfn = PFN_INVALID;
   4235 		return (ENOMEM);
   4236 	}
   4237 
   4238 	sfhmep = &pahmep->sfment;
   4239 
   4240 	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
   4241 	eaddr = saddr + len;
   4242 
   4243 rehash:
   4244 	/* Find the mapping(s) for this page */
   4245 	for (hashno = TTE64K, hmeblkp = NULL;
   4246 	    hmeblkp == NULL && hashno <= mmu_hashcnt;
   4247 	    hashno++) {
   4248 		hmeshift = HME_HASH_SHIFT(hashno);
   4249 		hblktag.htag_id = ksfmmup;
   4250 		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   4251 		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
   4252 		hblktag.htag_rehash = hashno;
   4253 		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
   4254 
   4255 		SFMMU_HASH_LOCK(hmebp);
   4256 
   4257 		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   4258 
   4259 		if (hmeblkp == NULL)
   4260 			SFMMU_HASH_UNLOCK(hmebp);
   4261 	}
   4262 
   4263 	if (hmeblkp == NULL) {
   4264 		kmem_cache_free(pa_hment_cache, pahmep);
   4265 		*rpfn = PFN_INVALID;
   4266 		return (ENXIO);
   4267 	}
   4268 
   4269 	ASSERT(!hmeblkp->hblk_shared);
   4270 
   4271 	HBLKTOHME(osfhmep, hmeblkp, saddr);
   4272 	sfmmu_copytte(&osfhmep->hme_tte, &tte);
   4273 
   4274 	if (!TTE_IS_VALID(&tte)) {
   4275 		SFMMU_HASH_UNLOCK(hmebp);
   4276 		kmem_cache_free(pa_hment_cache, pahmep);
   4277 		*rpfn = PFN_INVALID;
   4278 		return (ENXIO);
   4279 	}
   4280 
   4281 	/*
   4282 	 * Make sure the boundaries for the callback fall within this
   4283 	 * single mapping.
   4284 	 */
   4285 	baseaddr = (caddr_t)get_hblk_base(hmeblkp);
   4286 	ASSERT(saddr >= baseaddr);
   4287 	if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
   4288 		SFMMU_HASH_UNLOCK(hmebp);
   4289 		kmem_cache_free(pa_hment_cache, pahmep);
   4290 		*rpfn = PFN_INVALID;
   4291 		return (ERANGE);
   4292 	}
   4293 
   4294 	pfn = sfmmu_ttetopfn(&tte, vaddr);
   4295 
   4296 	/*
   4297 	 * The pfn may not have a page_t underneath in which case we
   4298 	 * just return it. This can happen if we are doing I/O to a
   4299 	 * static portion of the kernel's address space, for instance.
   4300 	 */
   4301 	pp = osfhmep->hme_page;
   4302 	if (pp == NULL) {
   4303 		SFMMU_HASH_UNLOCK(hmebp);
   4304 		kmem_cache_free(pa_hment_cache, pahmep);
   4305 		*rpfn = pfn;
   4306 		if (cookiep)
   4307 			*cookiep = HAC_COOKIE_NONE;
   4308 		return (0);
   4309 	}
   4310 	ASSERT(pp == PP_PAGEROOT(pp));
   4311 
   4312 	vp = pp->p_vnode;
   4313 	off = pp->p_offset;
   4314 
   4315 	pml = sfmmu_mlist_enter(pp);
   4316 
   4317 	if (flags & HAC_PAGELOCK) {
   4318 		if (!page_trylock(pp, SE_SHARED)) {
   4319 			/*
   4320 			 * Somebody is holding SE_EXCL lock. Might
   4321 			 * even be hat_page_relocate(). Drop all
   4322 			 * our locks, lookup the page in &kvp, and
   4323 			 * retry. If it doesn't exist in &kvp and &zvp,
   4324 			 * then we must be dealing with a kernel mapped
   4325 			 * page which doesn't actually belong to
   4326 			 * segkmem so we punt.
   4327 			 */
   4328 			sfmmu_mlist_exit(pml);
   4329 			SFMMU_HASH_UNLOCK(hmebp);
   4330 			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
   4331 
   4332 			/* check zvp before giving up */
   4333 			if (pp == NULL)
   4334 				pp = page_lookup(&zvp, (u_offset_t)saddr,
   4335 				    SE_SHARED);
   4336 
   4337 			/* Okay, we didn't find it, give up */
   4338 			if (pp == NULL) {
   4339 				kmem_cache_free(pa_hment_cache, pahmep);
   4340 				*rpfn = pfn;
   4341 				if (cookiep)
   4342 					*cookiep = HAC_COOKIE_NONE;
   4343 				return (0);
   4344 			}
   4345 			page_unlock(pp);
   4346 			goto rehash;
   4347 		}
   4348 		locked = 1;
   4349 	}
   4350 
   4351 	if (!PAGE_LOCKED(pp) && !panicstr)
   4352 		panic("hat_add_callback: page 0x%p not locked", (void *)pp);
   4353 
   4354 	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
   4355 	    pp->p_offset != off) {
   4356 		/*
   4357 		 * The page moved before we got our hands on it.  Drop
   4358 		 * all the locks and try again.
   4359 		 */
   4360 		ASSERT((flags & HAC_PAGELOCK) != 0);
   4361 		sfmmu_mlist_exit(pml);
   4362 		SFMMU_HASH_UNLOCK(hmebp);
   4363 		page_unlock(pp);
   4364 		locked = 0;
   4365 		goto rehash;
   4366 	}
   4367 
   4368 	if (!VN_ISKAS(vp)) {
   4369 		/*
   4370 		 * This is not a segkmem page but another page which
   4371 		 * has been kernel mapped. It had better have at least
   4372 		 * a share lock on it. Return the pfn.
   4373 		 */
   4374 		sfmmu_mlist_exit(pml);
   4375 		SFMMU_HASH_UNLOCK(hmebp);
   4376 		if (locked)
   4377 			page_unlock(pp);
   4378 		kmem_cache_free(pa_hment_cache, pahmep);
   4379 		ASSERT(PAGE_LOCKED(pp));
   4380 		*rpfn = pfn;
   4381 		if (cookiep)
   4382 			*cookiep = HAC_COOKIE_NONE;
   4383 		return (0);
   4384 	}
   4385 
   4386 	/*
   4387 	 * Setup this pa_hment and link its embedded dummy sf_hment into
   4388 	 * the mapping list.
   4389 	 */
   4390 	pp->p_share++;
   4391 	pahmep->cb_id = callback_id;
   4392 	pahmep->addr = vaddr;
   4393 	pahmep->len = len;
   4394 	pahmep->refcnt = 1;
   4395 	pahmep->flags = 0;
   4396 	pahmep->pvt = pvt;
   4397 
   4398 	sfhmep->hme_tte.ll = 0;
   4399 	sfhmep->hme_data = pahmep;
   4400 	sfhmep->hme_prev = osfhmep;
   4401 	sfhmep->hme_next = osfhmep->hme_next;
   4402 
   4403 	if (osfhmep->hme_next)
   4404 		osfhmep->hme_next->hme_prev = sfhmep;
   4405 
   4406 	osfhmep->hme_next = sfhmep;
   4407 
   4408 	sfmmu_mlist_exit(pml);
   4409 	SFMMU_HASH_UNLOCK(hmebp);
   4410 
   4411 	if (locked)
   4412 		page_unlock(pp);
   4413 
   4414 	*rpfn = pfn;
   4415 	if (cookiep)
   4416 		*cookiep = (void *)pahmep;
   4417 
   4418 	return (0);
   4419 }
   4420 
   4421 /*
   4422  * Remove the relocation callbacks from the specified addr/len.
         *
         * A non-NULL cookie is used directly as the pa_hment to release.  With a
         * NULL cookie the page's p_mapping list is searched for a pa_hment whose
         * pvt, addr and len all match the arguments.  Each call drops one
         * pa_hment reference and one p_share; when the reference count reaches
         * zero the pa_hment is unlinked from the mapping list and freed back to
         * pa_hment_cache.
         *
         * If HAC_PAGELOCK is set in flags the page is share-locked here with
         * page_trylock(); when that fails every lock is dropped and the lookup
         * is retried from the rehash label.
         *
         * Nothing is done when cookie is HAC_COOKIE_NONE, when vaddr is a kpm
         * address, when no hme block or valid tte is found for the address,
         * when the mapping has no page_t behind it, or when the page's vnode
         * fails VN_ISKAS().
   4423  */
   4424 void
   4425 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
   4426 	void *cookie)
   4427 {
   4428 	struct		hmehash_bucket *hmebp;
   4429 	hmeblk_tag	hblktag;
   4430 	struct hme_blk	*hmeblkp;
   4431 	int		hmeshift, hashno;
   4432 	caddr_t		saddr;
   4433 	struct pa_hment	*pahmep;
   4434 	struct sf_hment	*sfhmep, *osfhmep;
   4435 	kmutex_t	*pml;
   4436 	tte_t		tte;
   4437 	page_t		*pp;
   4438 	vnode_t		*vp;
   4439 	u_offset_t	off;
   4440 	int		locked = 0;
   4441 
   4442 	/*
   4443 	 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
   4444 	 * remove so just return.
   4445 	 */
   4446 	if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
   4447 		return;
   4448 
        	/*
        	 * Mask vaddr with MMU_PAGEMASK; saddr is the address used for
        	 * every lookup below.
        	 */
   4449 	saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
   4450 
   4451 rehash:
   4452 	/* Find the mapping(s) for this page */
        	/*
        	 * Each rehash size from TTE64K up to mmu_hashcnt is tried until an
        	 * hme block is found.  On a hit the loop ends with hmebp still
        	 * locked; on a miss the bucket is unlocked before the next size
        	 * is tried.
        	 */
   4453 	for (hashno = TTE64K, hmeblkp = NULL;
   4454 	    hmeblkp == NULL && hashno <= mmu_hashcnt;
   4455 	    hashno++) {
   4456 		hmeshift = HME_HASH_SHIFT(hashno);
   4457 		hblktag.htag_id = ksfmmup;
   4458 		hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   4459 		hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
   4460 		hblktag.htag_rehash = hashno;
   4461 		hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
   4462 
   4463 		SFMMU_HASH_LOCK(hmebp);
   4464 
   4465 		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   4466 
   4467 		if (hmeblkp == NULL)
   4468 			SFMMU_HASH_UNLOCK(hmebp);
   4469 	}
   4470 
        	/* No hme block at any size; no locks are held at this point. */
   4471 	if (hmeblkp == NULL)
   4472 		return;
   4473 
   4474 	ASSERT(!hmeblkp->hblk_shared);
   4475 
   4476 	HBLKTOHME(osfhmep, hmeblkp, saddr);
   4477 
        	/* Work on a private copy of the tte; give up if it is not valid. */
   4478 	sfmmu_copytte(&osfhmep->hme_tte, &tte);
   4479 	if (!TTE_IS_VALID(&tte)) {
   4480 		SFMMU_HASH_UNLOCK(hmebp);
   4481 		return;
   4482 	}
   4483 
        	/* A mapping with no page_t cannot carry a pa_hment. */
   4484 	pp = osfhmep->hme_page;
   4485 	if (pp == NULL) {
   4486 		SFMMU_HASH_UNLOCK(hmebp);
   4487 		ASSERT(cookie == NULL);
   4488 		return;
   4489 	}
   4490 
        	/*
        	 * Snapshot the page identity so that a change can be detected by
        	 * the recheck that follows the locking below.
        	 */
   4491 	vp = pp->p_vnode;
   4492 	off = pp->p_offset;
   4493 
   4494 	pml = sfmmu_mlist_enter(pp);
   4495 
   4496 	if (flags & HAC_PAGELOCK) {
   4497 		if (!page_trylock(pp, SE_SHARED)) {
   4498 			/*
   4499 			 * Somebody is holding SE_EXCL lock. Might
   4500 			 * even be hat_page_relocate(). Drop all
   4501 			 * our locks, lookup the page in &kvp, and
   4502 			 * retry. If it doesn't exist in &kvp and &zvp,
   4503 			 * then we must be dealing with a kernel mapped
   4504 			 * page which doesn't actually belong to
   4505 			 * segkmem so we punt.
   4506 			 */
   4507 			sfmmu_mlist_exit(pml);
   4508 			SFMMU_HASH_UNLOCK(hmebp);
   4509 			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
   4510 			/* check zvp before giving up */
   4511 			if (pp == NULL)
   4512 				pp = page_lookup(&zvp, (u_offset_t)saddr,
   4513 				    SE_SHARED);
   4514 
   4515 			if (pp == NULL) {
   4516 				ASSERT(cookie == NULL);
   4517 				return;
   4518 			}
        			/*
        			 * The lookup's share lock is released again; the
        			 * retry takes its own lock with page_trylock().
        			 */
   4519 			page_unlock(pp);
   4520 			goto rehash;
   4521 		}
        		/* This function took the page lock and must drop it. */
   4522 		locked = 1;
   4523 	}
   4524 
   4525 	ASSERT(PAGE_LOCKED(pp));
   4526 
   4527 	if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
   4528 	    pp->p_offset != off) {
   4529 		/*
   4530 		 * The page moved before we got our hands on it.  Drop
   4531 		 * all the locks and try again.
   4532 		 */
   4533 		ASSERT((flags & HAC_PAGELOCK) != 0);
   4534 		sfmmu_mlist_exit(pml);
   4535 		SFMMU_HASH_UNLOCK(hmebp);
   4536 		page_unlock(pp);
   4537 		locked = 0;
   4538 		goto rehash;
   4539 	}
   4540 
   4541 	if (!VN_ISKAS(vp)) {
   4542 		/*
   4543 		 * This is not a segkmem page but another page which
   4544 		 * has been kernel mapped.
   4545 		 */
   4546 		sfmmu_mlist_exit(pml);
   4547 		SFMMU_HASH_UNLOCK(hmebp);
   4548 		if (locked)
   4549 			page_unlock(pp);
   4550 		ASSERT(cookie == NULL);
   4551 		return;
   4552 	}
   4553 
   4554 	if (cookie != NULL) {
        		/*
        		 * The cookie is the pa_hment itself; its embedded sf_hment
        		 * is the entry to unlink from the mapping list.
        		 */
   4555 		pahmep = (struct pa_hment *)cookie;
   4556 		sfhmep = &pahmep->sfment;
   4557 	} else {
        		/*
        		 * No cookie: scan the page's mapping list.  sfhmep is left
        		 * NULL when no pa_hment matches.
        		 */
   4558 		for (sfhmep = pp->p_mapping; sfhmep != NULL;
   4559 		    sfhmep = sfhmep->hme_next) {
   4560 
   4561 			/*
   4562 			 * skip va<->pa mappings
   4563 			 */
   4564 			if (!IS_PAHME(sfhmep))
   4565 				continue;
   4566 
   4567 			pahmep = sfhmep->hme_data;
   4568 			ASSERT(pahmep != NULL);
   4569 
   4570 			/*
   4571 			 * if pa_hment matches, remove it
   4572 			 */
   4573 			if ((pahmep->pvt == pvt) &&
   4574 			    (pahmep->addr == vaddr) &&
   4575 			    (pahmep->len == len)) {
   4576 				break;
   4577 			}
   4578 		}
   4579 	}
   4580 
        	/*
        	 * NOTE(review): the return below is only reached when panicstr is
        	 * set; on that path pml, hmebp and any page lock taken above are
        	 * not released.
        	 */
   4581 	if (sfhmep == NULL) {
   4582 		if (!panicstr) {
   4583 			panic("hat_delete_callback: pa_hment not found, pp %p",
   4584 			    (void *)pp);
   4585 		}
   4586 		return;
   4587 	}
   4588 
   4589 	/*
   4590 	 * Note: at this point a valid kernel mapping must still be
   4591 	 * present on this page.
   4592 	 */
   4593 	pp->p_share--;
   4594 	if (pp->p_share <= 0)
   4595 		panic("hat_delete_callback: zero p_share");
   4596 
        	/* Last reference dropped: unlink the pa_hment and free it. */
   4597 	if (--pahmep->refcnt == 0) {
   4598 		if (pahmep->flags != 0)
   4599 			panic("hat_delete_callback: pa_hment is busy");
   4600 
   4601 		/*
   4602 		 * Remove sfhmep from the mapping list for the page.
   4603 		 */
   4604 		if (sfhmep->hme_prev) {
   4605 			sfhmep->hme_prev->hme_next = sfhmep->hme_next;
   4606 		} else {
   4607 			pp->p_mapping = sfhmep->hme_next;
   4608 		}
   4609 
   4610 		if (sfhmep->hme_next)
   4611 			sfhmep->hme_next->hme_prev = sfhmep->hme_prev;
   4612 
   4613 		sfmmu_mlist_exit(pml);
   4614 		SFMMU_HASH_UNLOCK(hmebp);
   4615 
   4616 		if (locked)
   4617 			page_unlock(pp);
   4618 
        		/* Freed only after all locks have been dropped. */
   4619 		kmem_cache_free(pa_hment_cache, pahmep);
   4620 		return;
   4621 	}
   4622 
        	/* Other references remain: leave the pa_hment linked, drop locks. */
   4623 	sfmmu_mlist_exit(pml);
   4624 	SFMMU_HASH_UNLOCK(hmebp);
   4625 	if (locked)
   4626 		page_unlock(pp);
   4627 }
   4628 
   4629 /*
   4630  * hat_probe returns 1 if the translation for the address 'addr' is
   4631  * loaded, zero otherwise.
   4632  *
   4633  * hat_probe should be used only for advisorary purposes because it may
   4634  * occasionally return the wrong value. The implementation must guarantee that
   4635  * returning the wrong value is a very rare event. hat_probe is used
   4636  * to implement optimizations in the segment drivers.
   4637  *
   4638  */
   4639 int
   4640 hat_probe(struct hat *sfmmup, caddr_t addr)
   4641 {
   4642 	pfn_t pfn;
   4643 	tte_t tte;
   4644 
   4645 	ASSERT(sfmmup != NULL);
   4646 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   4647 
   4648 	ASSERT((sfmmup == ksfmmup) ||
   4649 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   4650 
   4651 	if (sfmmup == ksfmmup) {
   4652 		while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
   4653 		    == PFN_SUSPENDED) {
   4654 			sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
   4655 		}
   4656 	} else {
   4657 		pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
   4658 	}
   4659 
   4660 	if (pfn != PFN_INVALID)
   4661 		return (1);
   4662 	else
   4663 		return (0);
   4664 }
   4665 
   4666 ssize_t
   4667 hat_getpagesize(struct hat *sfmmup, caddr_t addr)
   4668 {
   4669 	tte_t tte;
   4670 
   4671 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   4672 
   4673 	if (sfmmup == ksfmmup) {
   4674 		if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4675 			return (-1);
   4676 		}
   4677 	} else {
   4678 		if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4679 			return (-1);
   4680 		}
   4681 	}
   4682 
   4683 	ASSERT(TTE_IS_VALID(&tte));
   4684 	return (TTEBYTES(TTE_CSZ(&tte)));
   4685 }
   4686 
   4687 uint_t
   4688 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr)
   4689 {
   4690 	tte_t tte;
   4691 
   4692 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   4693 
   4694 	if (sfmmup == ksfmmup) {
   4695 		if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4696 			tte.ll = 0;
   4697 		}
   4698 	} else {
   4699 		if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
   4700 			tte.ll = 0;
   4701 		}
   4702 	}
   4703 	if (TTE_IS_VALID(&tte)) {
   4704 		*attr = sfmmu_ptov_attr(&tte);
   4705 		return (0);
   4706 	}
   4707 	*attr = 0;
   4708 	return ((uint_t)0xffffffff);
   4709 }
   4710 
   4711 /*
   4712  * Enables more attributes on specified address range (ie. logical OR)
   4713  */
   4714 void
   4715 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
   4716 {
   4717 	if (hat->sfmmu_xhat_provider) {
   4718 		XHAT_SETATTR(hat, addr, len, attr);
   4719 		return;
   4720 	} else {
   4721 		/*
   4722 		 * This must be a CPU HAT. If the address space has
   4723 		 * XHATs attached, change attributes for all of them,
   4724 		 * just in case
   4725 		 */
   4726 		ASSERT(hat->sfmmu_as != NULL);
   4727 		if (hat->sfmmu_as->a_xhat != NULL)
   4728 			xhat_setattr_all(hat->sfmmu_as, addr, len, attr);
   4729 	}
   4730 
   4731 	sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR);
   4732 }
   4733 
   4734 /*
   4735  * Assigns attributes to the specified address range.  All the attributes
   4736  * are specified.
   4737  */
   4738 void
   4739 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
   4740 {
   4741 	if (hat->sfmmu_xhat_provider) {
   4742 		XHAT_CHGATTR(hat, addr, len, attr);
   4743 		return;
   4744 	} else {
   4745 		/*
   4746 		 * This must be a CPU HAT. If the address space has
   4747 		 * XHATs attached, change attributes for all of them,
   4748 		 * just in case
   4749 		 */
   4750 		ASSERT(hat->sfmmu_as != NULL);
   4751 		if (hat->sfmmu_as->a_xhat != NULL)
   4752 			xhat_chgattr_all(hat->sfmmu_as, addr, len, attr);
   4753 	}
   4754 
   4755 	sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR);
   4756 }
   4757 
   4758 /*
   4759  * Remove attributes on the specified address range (ie. loginal NAND)
   4760  */
   4761 void
   4762 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
   4763 {
   4764 	if (hat->sfmmu_xhat_provider) {
   4765 		XHAT_CLRATTR(hat, addr, len, attr);
   4766 		return;
   4767 	} else {
   4768 		/*
   4769 		 * This must be a CPU HAT. If the address space has
   4770 		 * XHATs attached, change attributes for all of them,
   4771 		 * just in case
   4772 		 */
   4773 		ASSERT(hat->sfmmu_as != NULL);
   4774 		if (hat->sfmmu_as->a_xhat != NULL)
   4775 			xhat_clrattr_all(hat->sfmmu_as, addr, len, attr);
   4776 	}
   4777 
   4778 	sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR);
   4779 }
   4780 
   4781 /*
   4782  * Change attributes on an address range to that specified by attr and mode.
         *
         * mode is handed on to sfmmu_hblk_chgattr(); the callers visible in this
         * file pass SFMMU_SETATTR, SFMMU_CHGATTR or SFMMU_CLRATTR.  addr and len
         * are asserted to be MMU page aligned, and for any hat other than
         * ksfmmup the address space lock is asserted to be held.
         *
         * The range is walked one hme block at a time.  For each address the
         * hash is probed starting at hashno 1; a hit either skips a shadow
         * hmeblk's range or hands the block to sfmmu_hblk_chgattr(), which
         * returns the next address to process.  Addresses with no hme block are
         * stepped over.
   4783  */
   4784 static void
   4785 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
   4786 	int mode)
   4787 {
   4788 	struct hmehash_bucket *hmebp;
   4789 	hmeblk_tag hblktag;
   4790 	int hmeshift, hashno = 1;
   4791 	struct hme_blk *hmeblkp, *list = NULL;
   4792 	caddr_t endaddr;
   4793 	cpuset_t cpuset;
   4794 	demap_range_t dmr;
   4795 
        	/* cpuset is overwritten with sfmmu_cpusran before its only use. */
   4796 	CPUSET_ZERO(cpuset);
   4797 
   4798 	ASSERT((sfmmup == ksfmmup) ||
   4799 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   4800 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   4801 	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
   4802 
        	/*
        	 * Any mode other than SFMMU_CLRATTR that names PROT_USER on a
        	 * range ending above USERLIMIT is fatal.
        	 */
   4803 	if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
   4804 	    ((addr + len) > (caddr_t)USERLIMIT)) {
   4805 		panic("user addr %p in kernel space",
   4806 		    (void *)addr);
   4807 	}
   4808 
   4809 	endaddr = addr + len;
   4810 	hblktag.htag_id = sfmmup;
   4811 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   4812 	DEMAP_RANGE_INIT(sfmmup, &dmr);
   4813 
   4814 	while (addr < endaddr) {
        		/* Build the tag and pick the bucket for this hash size. */
   4815 		hmeshift = HME_HASH_SHIFT(hashno);
   4816 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   4817 		hblktag.htag_rehash = hashno;
   4818 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   4819 
   4820 		SFMMU_HASH_LOCK(hmebp);
   4821 
   4822 		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
   4823 		if (hmeblkp != NULL) {
   4824 			ASSERT(!hmeblkp->hblk_shared);
   4825 			/*
   4826 			 * We've encountered a shadow hmeblk so skip the range
   4827 			 * of the next smaller mapping size.
   4828 			 */
   4829 			if (hmeblkp->hblk_shw_bit) {
   4830 				ASSERT(sfmmup != ksfmmup);
   4831 				ASSERT(hashno > 1);
   4832 				addr = (caddr_t)P2END((uintptr_t)addr,
   4833 				    TTEBYTES(hashno - 1));
   4834 			} else {
   4835 				addr = sfmmu_hblk_chgattr(sfmmup,
   4836 				    hmeblkp, addr, endaddr, &dmr, attr, mode);
   4837 			}
   4838 			SFMMU_HASH_UNLOCK(hmebp);
        			/* addr has advanced: restart at the smallest size. */
   4839 			hashno = 1;
   4840 			continue;
   4841 		}
   4842 		SFMMU_HASH_UNLOCK(hmebp);
   4843 
   4844 		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
   4845 			/*
   4846 			 * We have traversed the whole list and rehashed
   4847 			 * if necessary without finding the address to chgattr.
   4848 			 * This is ok, so we increment the address by the
   4849 			 * smallest hmeblk range for kernel mappings or for
   4850 			 * user mappings with no large pages, and the largest
   4851 			 * hmeblk range, to account for shadow hmeblks, for
   4852 			 * user mappings with large pages and continue.
   4853 			 */
   4854 			if (sfmmup == ksfmmup)
   4855 				addr = (caddr_t)P2END((uintptr_t)addr,
   4856 				    TTEBYTES(1));
   4857 			else
   4858 				addr = (caddr_t)P2END((uintptr_t)addr,
   4859 				    TTEBYTES(hashno));
   4860 			hashno = 1;
   4861 		} else {
        			/* Not found at this size: try the next larger one. */
   4862 			hashno++;
   4863 		}
   4864 	}
   4865 
        	/*
        	 * 'list' was passed to every HME_HASH_SEARCH above and is purged
        	 * here (presumably it collects hmeblks to be freed -- confirm
        	 * against the macro).  The demap range is then flushed and
        	 * xt_sync() is called on the hat's sfmmu_cpusran set.
        	 */
   4866 	sfmmu_hblks_list_purge(&list, 0);
   4867 	DEMAP_RANGE_FLUSH(&dmr);
   4868 	cpuset = sfmmup->sfmmu_cpusran;
   4869 	xt_sync(cpuset);
   4870 }
   4871 
   4872 /*
   4873  * This function chgattr on a range of addresses in an hmeblk.  It returns the
   4874  * next address that needs to be chgattr.
   4875  * It should be called with the hash lock held.
   4876  * XXX It should be possible to optimize chgattr by not flushing every time but
   4877  * on the other hand:
   4878  * 1. do one flush crosscall.
   4879  * 2. only flush if we are increasing permissions (make sure this will work)
         *
         * addr must lie inside hmeblkp, which must be neither a shadow nor a
         * shared hmeblk (all asserted).  endaddr is clamped to the end of the
         * hmeblk, so the walk never leaves this block.  attr and mode are
         * converted once by sfmmu_vtop_attr() into a value/mask pair that is
         * applied to every valid tte in the range.  dmrp is the caller's demap
         * range used to record the pages that need flushing.
   4880  */
   4881 static caddr_t
   4882 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
   4883 	caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
   4884 {
   4885 	tte_t tte, tteattr, tteflags, ttemod;
   4886 	struct sf_hment *sfhmep;
   4887 	int ttesz;
   4888 	struct page *pp = NULL;
   4889 	kmutex_t *pml, *pmtx;
   4890 	int ret;
   4891 	int use_demap_range;
   4892 #if defined(SF_ERRATA_57)
   4893 	int check_exec;
   4894 #endif
   4895 
   4896 	ASSERT(in_hblk_range(hmeblkp, addr));
   4897 	ASSERT(hmeblkp->hblk_shw_bit == 0);
   4898 	ASSERT(!hmeblkp->hblk_shared);
   4899 
        	/* Never walk past the end of this hmeblk. */
   4900 	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
   4901 	ttesz = get_hblk_ttesz(hmeblkp);
   4902 
   4903 	/*
   4904 	 * Flush the current demap region if addresses have been
   4905 	 * skipped or the page size doesn't match.
   4906 	 */
   4907 	use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
   4908 	if (use_demap_range) {
   4909 		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
   4910 	} else {
   4911 		DEMAP_RANGE_FLUSH(dmrp);
   4912 	}
   4913 
        	/* tteattr holds the new bit values, tteflags the bits affected. */
   4914 	tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
   4915 #if defined(SF_ERRATA_57)
   4916 	check_exec = (sfmmup != ksfmmup) &&
   4917 	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
   4918 	    TTE_IS_EXECUTABLE(&tteattr);
   4919 #endif
   4920 	HBLKTOHME(sfhmep, hmeblkp, addr);
   4921 	while (addr < endaddr) {
   4922 		sfmmu_copytte(&sfhmep->hme_tte, &tte);
   4923 		if (TTE_IS_VALID(&tte)) {
   4924 			if ((tte.ll & tteflags.ll) == tteattr.ll) {
   4925 				/*
   4926 				 * if the new attr is the same as old
   4927 				 * continue
   4928 				 */
   4929 				goto next_addr;
   4930 			}
   4931 			if (!TTE_IS_WRITABLE(&tteattr)) {
   4932 				/*
   4933 				 * make sure we clear hw modify bit if we
   4934 				 * removing write protections
   4935 				 */
   4936 				tteflags.tte_intlo |= TTE_HWWR_INT;
   4937 			}
   4938 
        			/* Take the page's mapping list lock when a page exists. */
   4939 			pml = NULL;
   4940 			pp = sfhmep->hme_page;
   4941 			if (pp) {
   4942 				pml = sfmmu_mlist_enter(pp);
   4943 			}
   4944 
   4945 			if (pp != sfhmep->hme_page) {
   4946 				/*
   4947 				 * tte must have been unloaded.
   4948 				 */
        				/*
        				 * Retry this same addr: addr and sfhmep are
        				 * only advanced at next_addr.
        				 */
   4949 				ASSERT(pml);
   4950 				sfmmu_mlist_exit(pml);
   4951 				continue;
   4952 			}
   4953 
   4954 			ASSERT(pp == NULL || sfmmu_mlist_held(pp));
   4955 
        			/*
        			 * Build the new tte: clear the affected bits, then OR
        			 * in the new values.  The pfn must be unchanged.
        			 */
   4956 			ttemod = tte;
   4957 			ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
   4958 			ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));
   4959 
   4960 #if defined(SF_ERRATA_57)
   4961 			if (check_exec && addr < errata57_limit)
   4962 				ttemod.tte_exec_perm = 0;
   4963 #endif
   4964 			ret = sfmmu_modifytte_try(&tte, &ttemod,
   4965 			    &sfhmep->hme_tte);
   4966 
   4967 			if (ret < 0) {
   4968 				/* tte changed underneath us */
        				/* Drop the list lock and retry the same addr. */
   4969 				if (pml) {
   4970 					sfmmu_mlist_exit(pml);
   4971 				}
   4972 				continue;
   4973 			}
   4974 
   4975 			if (tteflags.tte_intlo & TTE_HWWR_INT) {
   4976 				/*
   4977 				 * need to sync if we are clearing modify bit.
   4978 				 */
   4979 				sfmmu_ttesync(sfmmup, addr, &tte, pp);
   4980 			}
   4981 
        			/*
        			 * Write permission is being granted on a page marked
        			 * read-only: clear that mark between
        			 * sfmmu_page_enter() and sfmmu_page_exit().
        			 */
   4982 			if (pp && PP_ISRO(pp)) {
   4983 				if (tteattr.tte_intlo & TTE_WRPRM_INT) {
   4984 					pmtx = sfmmu_page_enter(pp);
   4985 					PP_CLRRO(pp);
   4986 					sfmmu_page_exit(pmtx);
   4987 				}
   4988 			}
   4989 
        			/*
        			 * NOTE(review): ret > 0 presumably means the tte was
        			 * actually modified -- confirm against
        			 * sfmmu_modifytte_try().  The page is then either
        			 * marked in the demap range or demapped directly.
        			 */
   4990 			if (ret > 0 && use_demap_range) {
   4991 				DEMAP_RANGE_MARKPG(dmrp, addr);
   4992 			} else if (ret > 0) {
   4993 				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
   4994 			}
   4995 
   4996 			if (pml) {
   4997 				sfmmu_mlist_exit(pml);
   4998 			}
   4999 		}
   5000 next_addr:
        		/* Step to the next tte of this hmeblk. */
   5001 		addr += TTEBYTES(ttesz);
   5002 		sfhmep++;
   5003 		DEMAP_RANGE_NEXTPG(dmrp);
   5004 	}
   5005 	return (addr);
   5006 }
   5007 
   5008 /*
   5009  * This routine converts virtual attributes to physical ones.  It will
   5010  * update the tteflags field with the tte mask corresponding to the attributes
   5011  * affected and it returns the new attributes.  It will also clear the modify
   5012  * bit if we are taking away write permission.  This is necessary since the
   5013  * modify bit is the hardware permission bit and we need to clear it in order
   5014  * to detect write faults.
   5015  */
   5016 static uint64_t
   5017 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
   5018 {
   5019 	tte_t ttevalue;
   5020 
   5021 	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
   5022 
   5023 	switch (mode) {
   5024 	case SFMMU_CHGATTR:
   5025 		/* all attributes specified */
   5026 		ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
   5027 		ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
   5028 		ttemaskp->tte_inthi = TTEINTHI_ATTR;
   5029 		ttemaskp->tte_intlo = TTEINTLO_ATTR;
   5030 		break;
   5031 	case SFMMU_SETATTR:
   5032 		ASSERT(!(attr & ~HAT_PROT_MASK));
   5033 		ttemaskp->ll = 0;
   5034 		ttevalue.ll = 0;
   5035 		/*
   5036 		 * a valid tte implies exec and read for sfmmu
   5037 		 * so no need to do anything about them.
   5038 		 * since priviledged access implies user access
   5039 		 * PROT_USER doesn't make sense either.
   5040 		 */
   5041 		if (attr & PROT_WRITE) {
   5042 			ttemaskp->tte_intlo |= TTE_WRPRM_INT;
   5043 			ttevalue.tte_intlo |= TTE_WRPRM_INT;
   5044 		}
   5045 		break;
   5046 	case SFMMU_CLRATTR:
   5047 		/* attributes will be nand with current ones */
   5048 		if (attr & ~(PROT_WRITE | PROT_USER)) {
   5049 			panic("sfmmu: attr %x not supported", attr);
   5050 		}
   5051 		ttemaskp->ll = 0;
   5052 		ttevalue.ll = 0;
   5053 		if (attr & PROT_WRITE) {
   5054 			/* clear both writable and modify bit */
   5055 			ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
   5056 		}
   5057 		if (attr & PROT_USER) {
   5058 			ttemaskp->tte_intlo |= TTE_PRIV_INT;
   5059 			ttevalue.tte_intlo |= TTE_PRIV_INT;
   5060 		}
   5061 		break;
   5062 	default:
   5063 		panic("sfmmu_vtop_attr: bad mode %x", mode);
   5064 	}
   5065 	ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
   5066 	return (ttevalue.ll);
   5067 }
   5068 
   5069 static uint_t
   5070 sfmmu_ptov_attr(tte_t *ttep)
   5071 {
   5072 	uint_t attr;
   5073 
   5074 	ASSERT(TTE_IS_VALID(ttep));
   5075 
   5076 	attr = PROT_READ;
   5077 
   5078 	if (TTE_IS_WRITABLE(ttep)) {
   5079 		attr |= PROT_WRITE;
   5080 	}
   5081 	if (TTE_IS_EXECUTABLE(ttep)) {
   5082 		attr |= PROT_EXEC;
   5083 	}
   5084 	if (!TTE_IS_PRIVILEGED(ttep)) {
   5085 		attr |= PROT_USER;
   5086 	}
   5087 	if (TTE_IS_NFO(ttep)) {
   5088 		attr |= HAT_NOFAULT;
   5089 	}
   5090 	if (TTE_IS_NOSYNC(ttep)) {
   5091 		attr |= HAT_NOSYNC;
   5092 	}
   5093 	if (TTE_IS_SIDEFFECT(ttep)) {
   5094 		attr |= SFMMU_SIDEFFECT;
   5095 	}
   5096 	if (!TTE_IS_VCACHEABLE(ttep)) {
   5097 		attr |= SFMMU_UNCACHEVTTE;
   5098 	}
   5099 	if (!TTE_IS_PCACHEABLE(ttep)) {
   5100 		attr |= SFMMU_UNCACHEPTTE;
   5101 	}
   5102 	return (attr);
   5103 }
   5104 
   5105 /*
   5106  * hat_chgprot is a deprecated hat call.  New segment drivers
   5107  * should store all attributes and use hat_*attr calls.
   5108  *
   5109  * Change the protections in the virtual address range
   5110  * given to the specified virtual protection.  If vprot is ~PROT_WRITE,
   5111  * then remove write permission, leaving the other
   5112  * permissions unchanged.  If vprot is ~PROT_USER, remove user permissions.
   5113  *
         * addr and len are asserted to be MMU page aligned.  An XHAT is handled
         * entirely by its provider; for a CPU HAT whose address space has XHATs
         * attached the change is also applied to all of them.
         *
         * The range is then walked one hme block at a time: a hit in the hash
         * either skips a shadow hmeblk's range or hands the block to
         * sfmmu_hblk_chgprot(), which returns the next address to process.
   5114  */
   5115 void
   5116 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
   5117 {
   5118 	struct hmehash_bucket *hmebp;
   5119 	hmeblk_tag hblktag;
   5120 	int hmeshift, hashno = 1;
   5121 	struct hme_blk *hmeblkp, *list = NULL;
   5122 	caddr_t endaddr;
   5123 	cpuset_t cpuset;
   5124 	demap_range_t dmr;
   5125 
   5126 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   5127 	ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
   5128 
   5129 	if (sfmmup->sfmmu_xhat_provider) {
   5130 		XHAT_CHGPROT(sfmmup, addr, len, vprot);
   5131 		return;
   5132 	} else {
   5133 		/*
   5134 		 * This must be a CPU HAT. If the address space has
   5135 		 * XHATs attached, change attributes for all of them,
   5136 		 * just in case
   5137 		 */
   5138 		ASSERT(sfmmup->sfmmu_as != NULL);
   5139 		if (sfmmup->sfmmu_as->a_xhat != NULL)
   5140 			xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot);
   5141 	}
   5142 
        	/* cpuset is overwritten with sfmmu_cpusran before its only use. */
   5143 	CPUSET_ZERO(cpuset);
   5144 
        	/*
        	 * Apart from the special value ~PROT_WRITE, a vprot that includes
        	 * PROT_USER on a range ending above USERLIMIT is fatal.
        	 */
   5145 	if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
   5146 	    ((addr + len) > (caddr_t)USERLIMIT)) {
   5147 		panic("user addr %p vprot %x in kernel space",
   5148 		    (void *)addr, vprot);
   5149 	}
   5150 	endaddr = addr + len;
   5151 	hblktag.htag_id = sfmmup;
   5152 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   5153 	DEMAP_RANGE_INIT(sfmmup, &dmr);
   5154 
   5155 	while (addr < endaddr) {
        		/* Build the tag and pick the bucket for this hash size. */
   5156 		hmeshift = HME_HASH_SHIFT(hashno);
   5157 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   5158 		hblktag.htag_rehash = hashno;
   5159 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   5160 
   5161 		SFMMU_HASH_LOCK(hmebp);
   5162 
   5163 		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
   5164 		if (hmeblkp != NULL) {
   5165 			ASSERT(!hmeblkp->hblk_shared);
   5166 			/*
   5167 			 * We've encountered a shadow hmeblk so skip the range
   5168 			 * of the next smaller mapping size.
   5169 			 */
   5170 			if (hmeblkp->hblk_shw_bit) {
   5171 				ASSERT(sfmmup != ksfmmup);
   5172 				ASSERT(hashno > 1);
   5173 				addr = (caddr_t)P2END((uintptr_t)addr,
   5174 				    TTEBYTES(hashno - 1));
   5175 			} else {
   5176 				addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
   5177 				    addr, endaddr, &dmr, vprot);
   5178 			}
   5179 			SFMMU_HASH_UNLOCK(hmebp);
        			/* addr has advanced: restart at the smallest size. */
   5180 			hashno = 1;
   5181 			continue;
   5182 		}
   5183 		SFMMU_HASH_UNLOCK(hmebp);
   5184 
   5185 		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
   5186 			/*
   5187 			 * We have traversed the whole list and rehashed
   5188 			 * if necessary without finding the address to chgprot.
   5189 			 * This is ok so we increment the address by the
   5190 			 * smallest hmeblk range for kernel mappings and the
   5191 			 * largest hmeblk range, to account for shadow hmeblks,
   5192 			 * for user mappings and continue.
   5193 			 */
   5194 			if (sfmmup == ksfmmup)
   5195 				addr = (caddr_t)P2END((uintptr_t)addr,
   5196 				    TTEBYTES(1));
   5197 			else
   5198 				addr = (caddr_t)P2END((uintptr_t)addr,
   5199 				    TTEBYTES(hashno));
   5200 			hashno = 1;
   5201 		} else {
        			/* Not found at this size: try the next larger one. */
   5202 			hashno++;
   5203 		}
   5204 	}
   5205 
        	/*
        	 * 'list' was passed to every HME_HASH_SEARCH above and is purged
        	 * here (presumably it collects hmeblks to be freed -- confirm
        	 * against the macro).  The demap range is then flushed and
        	 * xt_sync() is called on the hat's sfmmu_cpusran set.
        	 */
   5206 	sfmmu_hblks_list_purge(&list, 0);
   5207 	DEMAP_RANGE_FLUSH(&dmr);
   5208 	cpuset = sfmmup->sfmmu_cpusran;
   5209 	xt_sync(cpuset);
   5210 }
   5211 
   5212 /*
   5213  * This function chgprots a range of addresses in an hmeblk.  It returns the
   5214  * next address that needs to be chgprot.
   5215  * It should be called with the hash lock held.
   5216  * XXX It should be possible to optimize chgprot by not flushing every time but
   5217  * on the other hand:
   5218  * 1. do one flush crosscall.
   5219  * 2. only flush if we are increasing permissions (make sure this will work)
         *
         * addr must lie inside hmeblkp, which must be neither a shadow nor a
         * shared hmeblk (all asserted).  endaddr is clamped to the end of the
         * hmeblk, so the walk never leaves this block.  vprot is converted once
         * by sfmmu_vtop_prot() into a value/mask pair that is applied to every
         * valid tte in the range.  dmrp is the caller's demap range used to
         * record the pages that need flushing.
   5220  */
   5221 static caddr_t
   5222 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
   5223 	caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
   5224 {
   5225 	uint_t pprot;
   5226 	tte_t tte, ttemod;
   5227 	struct sf_hment *sfhmep;
   5228 	uint_t tteflags;
   5229 	int ttesz;
   5230 	struct page *pp = NULL;
   5231 	kmutex_t *pml, *pmtx;
   5232 	int ret;
   5233 	int use_demap_range;
   5234 #if defined(SF_ERRATA_57)
   5235 	int check_exec;
   5236 #endif
   5237 
   5238 	ASSERT(in_hblk_range(hmeblkp, addr));
   5239 	ASSERT(hmeblkp->hblk_shw_bit == 0);
   5240 	ASSERT(!hmeblkp->hblk_shared);
   5241 
   5242 #ifdef DEBUG
        	/* DEBUG only: a non-8K hmeblk must be covered up to its end. */
   5243 	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
   5244 	    (endaddr < get_hblk_endaddr(hmeblkp))) {
   5245 		panic("sfmmu_hblk_chgprot: partial chgprot of large page");
   5246 	}
   5247 #endif /* DEBUG */
   5248 
        	/* Never walk past the end of this hmeblk. */
   5249 	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
   5250 	ttesz = get_hblk_ttesz(hmeblkp);
   5251 
        	/* pprot holds the new bit values, tteflags the bits affected. */
   5252 	pprot = sfmmu_vtop_prot(vprot, &tteflags);
   5253 #if defined(SF_ERRATA_57)
   5254 	check_exec = (sfmmup != ksfmmup) &&
   5255 	    AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
   5256 	    ((vprot & PROT_EXEC) == PROT_EXEC);
   5257 #endif
   5258 	HBLKTOHME(sfhmep, hmeblkp, addr);
   5259 
   5260 	/*
   5261 	 * Flush the current demap region if addresses have been
   5262 	 * skipped or the page size doesn't match.
   5263 	 */
   5264 	use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
   5265 	if (use_demap_range) {
   5266 		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
   5267 	} else {
   5268 		DEMAP_RANGE_FLUSH(dmrp);
   5269 	}
   5270 
   5271 	while (addr < endaddr) {
   5272 		sfmmu_copytte(&sfhmep->hme_tte, &tte);
   5273 		if (TTE_IS_VALID(&tte)) {
   5274 			if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
   5275 				/*
   5276 				 * if the new protection is the same as old
   5277 				 * continue
   5278 				 */
   5279 				goto next_addr;
   5280 			}
        			/* Take the page's mapping list lock when a page exists. */
   5281 			pml = NULL;
   5282 			pp = sfhmep->hme_page;
   5283 			if (pp) {
   5284 				pml = sfmmu_mlist_enter(pp);
   5285 			}
   5286 			if (pp != sfhmep->hme_page) {
   5287 				/*
   5288 				 * tte must have been unloaded
   5289 				 * underneath us.  Recheck
   5290 				 */
        				/*
        				 * Retry this same addr: addr and sfhmep are
        				 * only advanced at next_addr.
        				 */
   5291 				ASSERT(pml);
   5292 				sfmmu_mlist_exit(pml);
   5293 				continue;
   5294 			}
   5295 
   5296 			ASSERT(pp == NULL || sfmmu_mlist_held(pp));
   5297 
        			/* Build the new tte from the copy with pprot applied. */
   5298 			ttemod = tte;
   5299 			TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
   5300 #if defined(SF_ERRATA_57)
   5301 			if (check_exec && addr < errata57_limit)
   5302 				ttemod.tte_exec_perm = 0;
   5303 #endif
   5304 			ret = sfmmu_modifytte_try(&tte, &ttemod,
   5305 			    &sfhmep->hme_tte);
   5306 
   5307 			if (ret < 0) {
   5308 				/* tte changed underneath us */
        				/* Drop the list lock and retry the same addr. */
   5309 				if (pml) {
   5310 					sfmmu_mlist_exit(pml);
   5311 				}
   5312 				continue;
   5313 			}
   5314 
   5315 			if (tteflags & TTE_HWWR_INT) {
   5316 				/*
   5317 				 * need to sync if we are clearing modify bit.
   5318 				 */
   5319 				sfmmu_ttesync(sfmmup, addr, &tte, pp);
   5320 			}
   5321 
        			/*
        			 * Write permission is being granted on a page marked
        			 * read-only: clear that mark between
        			 * sfmmu_page_enter() and sfmmu_page_exit().
        			 */
   5322 			if (pp && PP_ISRO(pp)) {
   5323 				if (pprot & TTE_WRPRM_INT) {
   5324 					pmtx = sfmmu_page_enter(pp);
   5325 					PP_CLRRO(pp);
   5326 					sfmmu_page_exit(pmtx);
   5327 				}
   5328 			}
   5329 
        			/*
        			 * NOTE(review): ret > 0 presumably means the tte was
        			 * actually modified -- confirm against
        			 * sfmmu_modifytte_try().  The page is then either
        			 * marked in the demap range or demapped directly.
        			 */
   5330 			if (ret > 0 && use_demap_range) {
   5331 				DEMAP_RANGE_MARKPG(dmrp, addr);
   5332 			} else if (ret > 0) {
   5333 				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
   5334 			}
   5335 
   5336 			if (pml) {
   5337 				sfmmu_mlist_exit(pml);
   5338 			}
   5339 		}
   5340 next_addr:
        		/* Step to the next tte of this hmeblk. */
   5341 		addr += TTEBYTES(ttesz);
   5342 		sfhmep++;
   5343 		DEMAP_RANGE_NEXTPG(dmrp);
   5344 	}
   5345 	return (addr);
   5346 }
   5347 
   5348 /*
   5349  * This routine is deprecated and should only be used by hat_chgprot.
   5350  * The correct routine is sfmmu_vtop_attr.
   5351  * This routine converts virtual page protections to physical ones.  It will
   5352  * update the tteflags field with the tte mask corresponding to the protections
   5353  * affected and it returns the new protections.  It will also clear the modify
   5354  * bit if we are taking away write permission.  This is necessary since the
   5355  * modify bit is the hardware permission bit and we need to clear it in order
   5356  * to detect write faults.
   5357  * It accepts the following special protections:
   5358  * ~PROT_WRITE = remove write permissions.
   5359  * ~PROT_USER = remove user permissions.
   5360  */
   5361 static uint_t
   5362 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
   5363 {
   5364 	if (vprot == (uint_t)~PROT_WRITE) {
   5365 		*tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
   5366 		return (0);		/* will cause wrprm to be cleared */
   5367 	}
   5368 	if (vprot == (uint_t)~PROT_USER) {
   5369 		*tteflagsp = TTE_PRIV_INT;
   5370 		return (0);		/* will cause privprm to be cleared */
   5371 	}
   5372 	if ((vprot == 0) || (vprot == PROT_USER) ||
   5373 	    ((vprot & PROT_ALL) != vprot)) {
   5374 		panic("sfmmu_vtop_prot -- bad prot %x", vprot);
   5375 	}
   5376 
   5377 	switch (vprot) {
   5378 	case (PROT_READ):
   5379 	case (PROT_EXEC):
   5380 	case (PROT_EXEC | PROT_READ):
   5381 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
   5382 		return (TTE_PRIV_INT); 		/* set prv and clr wrt */
   5383 	case (PROT_WRITE):
   5384 	case (PROT_WRITE | PROT_READ):
   5385 	case (PROT_EXEC | PROT_WRITE):
   5386 	case (PROT_EXEC | PROT_WRITE | PROT_READ):
   5387 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
   5388 		return (TTE_PRIV_INT | TTE_WRPRM_INT); 	/* set prv and wrt */
   5389 	case (PROT_USER | PROT_READ):
   5390 	case (PROT_USER | PROT_EXEC):
   5391 	case (PROT_USER | PROT_EXEC | PROT_READ):
   5392 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
   5393 		return (0); 			/* clr prv and wrt */
   5394 	case (PROT_USER | PROT_WRITE):
   5395 	case (PROT_USER | PROT_WRITE | PROT_READ):
   5396 	case (PROT_USER | PROT_EXEC | PROT_WRITE):
   5397 	case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
   5398 		*tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
   5399 		return (TTE_WRPRM_INT); 	/* clr prv and set wrt */
   5400 	default:
   5401 		panic("sfmmu_vtop_prot -- bad prot %x", vprot);
   5402 	}
   5403 	return (0);
   5404 }
   5405 
   5406 /*
   5407  * Alternate unload for very large virtual ranges. With a true 64 bit VA,
   5408  * the normal algorithm would take too long for a very large VA range with
   5409  * few real mappings. This routine just walks thru all HMEs in the global
   5410  * hash table to find and remove mappings.
         *
         * Every bucket of uhme_hash, index 0 through UHMEHASH_SZ inclusive, is
         * scanned with that bucket's hash lock held.  A non-shadow hmeblk tagged
         * with sfmmup that overlaps [startaddr, startaddr + len) and still has a
         * non-zero hblk_vcnt or hblk_hmecnt is passed to sfmmu_hblk_unload().
         * When HAT_UNLOAD_UNMAP is set in flags and both counts are zero
         * afterwards, the hmeblk is removed from the hash chain and queued on a
         * local list that is purged after the walk.
         *
         * NOTE(review): sfmmu_hblk_unload() is given the hmeblk's own base and
         * end address, not the requested range; the clamp to startaddr/endaddr
         * below is applied only to the range reported to the callback.
         *
         * If callback is non-NULL, one (start, end) pair is buffered per matching
         * hmeblk.  Each time MAX_CB_ADDR pairs have accumulated, and once more
         * after the walk, hcb_function is invoked for every buffered pair.  Unless
         * sfmmup->sfmmu_free is set, the pending demap range is flushed and
         * xt_sync() is issued for sfmmu_cpusran before the callbacks run.
         *
         * NOTE(review): the end address handed to the callback here is the last
         * byte of the range (inclusive), whereas hat_unload_callback() reports
         * the exclusive "next address" returned by sfmmu_hblk_unload() -- confirm
         * what callback consumers expect.
   5411  */
   5412 static void
   5413 hat_unload_large_virtual(
   5414 	struct hat		*sfmmup,
   5415 	caddr_t			startaddr,
   5416 	size_t			len,
   5417 	uint_t			flags,
   5418 	hat_callback_t		*callback)
   5419 {
   5420 	struct hmehash_bucket *hmebp;
   5421 	struct hme_blk *hmeblkp;
   5422 	struct hme_blk *pr_hblk = NULL;
   5423 	struct hme_blk *nx_hblk;
   5424 	struct hme_blk *list = NULL;
   5425 	int i;
   5426 	demap_range_t dmr, *dmrp;
   5427 	cpuset_t cpuset;
   5428 	caddr_t	endaddr = startaddr + len;
   5429 	caddr_t	sa;
   5430 	caddr_t	ea;
   5431 	caddr_t	cb_sa[MAX_CB_ADDR];
   5432 	caddr_t	cb_ea[MAX_CB_ADDR];
   5433 	int	addr_cnt = 0;
   5434 	int	a = 0;
   5435 
        	/*
        	 * No demap range is kept when sfmmu_free is set; dmrp == NULL also
        	 * suppresses the flush/xt_sync steps further down.
        	 */
   5436 	if (sfmmup->sfmmu_free) {
   5437 		dmrp = NULL;
   5438 	} else {
   5439 		dmrp = &dmr;
   5440 		DEMAP_RANGE_INIT(sfmmup, dmrp);
   5441 	}
   5442 
   5443 	/*
   5444 	 * Loop through all the hash buckets of HME blocks looking for matches.
   5445 	 */
   5446 	for (i = 0; i <= UHMEHASH_SZ; i++) {
   5447 		hmebp = &uhme_hash[i];
   5448 		SFMMU_HASH_LOCK(hmebp);
   5449 		hmeblkp = hmebp->hmeblkp;
   5450 		pr_hblk = NULL;
   5451 		while (hmeblkp) {
        			/* saved now: hmeblkp may be unlinked from the chain below */
   5452 			nx_hblk = hmeblkp->hblk_next;
   5453 
   5454 			/*
   5455 			 * skip if not this context, if a shadow block or
   5456 			 * if the mapping is not in the requested range
        			 *
        			 * sa and ea are assigned as a side effect of the
        			 * range tests; both are valid only when the whole
        			 * condition is false, i.e. when the block is kept.
   5457 			 */
   5458 			if (hmeblkp->hblk_tag.htag_id != sfmmup ||
   5459 			    hmeblkp->hblk_shw_bit ||
   5460 			    (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
   5461 			    (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {
   5462 				pr_hblk = hmeblkp;
   5463 				goto next_block;
   5464 			}
   5465 
   5466 			ASSERT(!hmeblkp->hblk_shared);
   5467 			/*
   5468 			 * unload if there are any current valid mappings
   5469 			 */
   5470 			if (hmeblkp->hblk_vcnt != 0 ||
   5471 			    hmeblkp->hblk_hmecnt != 0)
   5472 				(void) sfmmu_hblk_unload(sfmmup, hmeblkp,
   5473 				    sa, ea, dmrp, flags);
   5474 
   5475 			/*
   5476 			 * on unmap we also release the HME block itself, once
   5477 			 * all mappings are gone.
        			 * A block that stays on the chain becomes the
        			 * predecessor (pr_hblk) for the next removal.
   5478 			 */
   5479 			if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
   5480 			    !hmeblkp->hblk_vcnt &&
   5481 			    !hmeblkp->hblk_hmecnt) {
   5482 				ASSERT(!hmeblkp->hblk_lckcnt);
   5483 				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   5484 				    &list, 0);
   5485 			} else {
   5486 				pr_hblk = hmeblkp;
   5487 			}
   5488 
   5489 			if (callback == NULL)
   5490 				goto next_block;
   5491 
   5492 			/*
   5493 			 * HME blocks may span more than one page, but we may be
   5494 			 * unmapping only one page, so check for a smaller range
   5495 			 * for the callback
        			 * (--ea turns the exclusive block end into the
        			 * address of its last byte).
   5496 			 */
   5497 			if (sa < startaddr)
   5498 				sa = startaddr;
   5499 			if (--ea > endaddr)
   5500 				ea = endaddr - 1;
   5501 
   5502 			cb_sa[addr_cnt] = sa;
   5503 			cb_ea[addr_cnt] = ea;
        			/*
        			 * Buffer full: flush pending demaps, sync the cpus in
        			 * sfmmu_cpusran and report the buffered ranges.
        			 * NOTE(review): these callbacks run while this
        			 * bucket's hash lock is still held.
        			 */
   5504 			if (++addr_cnt == MAX_CB_ADDR) {
   5505 				if (dmrp != NULL) {
   5506 					DEMAP_RANGE_FLUSH(dmrp);
   5507 					cpuset = sfmmup->sfmmu_cpusran;
   5508 					xt_sync(cpuset);
   5509 				}
   5510 
   5511 				for (a = 0; a < MAX_CB_ADDR; ++a) {
   5512 					callback->hcb_start_addr = cb_sa[a];
   5513 					callback->hcb_end_addr = cb_ea[a];
   5514 					callback->hcb_function(callback);
   5515 				}
   5516 				addr_cnt = 0;
   5517 			}
   5518 
   5519 next_block:
   5520 			hmeblkp = nx_hblk;
   5521 		}
   5522 		SFMMU_HASH_UNLOCK(hmebp);
   5523 	}
   5524 
        	/* Dispose of the hmeblks unlinked above, then finish the demaps. */
   5525 	sfmmu_hblks_list_purge(&list, 0);
   5526 	if (dmrp != NULL) {
   5527 		DEMAP_RANGE_FLUSH(dmrp);
   5528 		cpuset = sfmmup->sfmmu_cpusran;
   5529 		xt_sync(cpuset);
   5530 	}
   5531 
        	/* addr_cnt can only be non-zero here when callback is non-NULL. */
   5532 	for (a = 0; a < addr_cnt; ++a) {
   5533 		callback->hcb_start_addr = cb_sa[a];
   5534 		callback->hcb_end_addr = cb_ea[a];
   5535 		callback->hcb_function(callback);
   5536 	}
   5537 
   5538 	/*
   5539 	 * Check TSB and TLB page sizes if the process isn't exiting.
   5540 	 */
   5541 	if (!sfmmup->sfmmu_free)
   5542 		sfmmu_check_page_sizes(sfmmup, 0);
   5543 }
   5544 
   5545 /*
   5546  * Unload all the mappings in the range [addr..addr+len). addr and len must
   5547  * be MMU_PAGESIZE aligned.
         *
         * A hat with an xhat provider is handed off to XHAT_UNLOAD_CALLBACK.
         * A user hat whose range needs more 4M probes than there are user hash
         * buckets is handed off to hat_unload_large_virtual().  Otherwise the
         * range is walked by probing the hme hash, one hash size at a time, and
         * each non-shadow hmeblk found is unloaded with sfmmu_hblk_unload().
         *
         * Emptied hmeblks are removed from the hash when HAT_UNLOAD_UNMAP is set
         * in flags, or for the kernel hat when the starting address is not in
         * segkmap.
         *
         * If callback is non-NULL, the unloaded address ranges are recorded
         * (adjacent ranges are merged) and reported through hcb_function in
         * batches of up to MAX_CB_ADDR, after the hash lock has been dropped and
         * the demaps have been flushed.  The reported end address is the value
         * returned by sfmmu_hblk_unload(), i.e. the next address to unload.
   5548  */
   5549 
   5550 extern struct seg *segkmap;
        /*
         * True when sfmmup is the kernel hat (ksfmmup) and addr lies within
         * [segkmap->s_base, segkmap->s_base + segkmap->s_size).
         */
   5551 #define	ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \
   5552 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))
   5553 
   5554 
   5555 void
   5556 hat_unload_callback(
   5557 	struct hat *sfmmup,
   5558 	caddr_t addr,
   5559 	size_t len,
   5560 	uint_t flags,
   5561 	hat_callback_t *callback)
   5562 {
   5563 	struct hmehash_bucket *hmebp;
   5564 	hmeblk_tag hblktag;
   5565 	int hmeshift, hashno, iskernel;
   5566 	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
   5567 	caddr_t endaddr;
   5568 	cpuset_t cpuset;
   5569 	int addr_count = 0;
   5570 	int a;
   5571 	caddr_t cb_start_addr[MAX_CB_ADDR];
   5572 	caddr_t cb_end_addr[MAX_CB_ADDR];
        	/* evaluated once, for the starting address only */
   5573 	int issegkmap = ISSEGKMAP(sfmmup, addr);
   5574 	demap_range_t dmr, *dmrp;
   5575 
   5576 	if (sfmmup->sfmmu_xhat_provider) {
   5577 		XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback);
   5578 		return;
   5579 	} else {
   5580 		/*
   5581 		 * This must be a CPU HAT. If the address space has
   5582 		 * XHATs attached, unload the mappings for all of them,
   5583 		 * just in case
   5584 		 */
   5585 		ASSERT(sfmmup->sfmmu_as != NULL);
   5586 		if (sfmmup->sfmmu_as->a_xhat != NULL)
   5587 			xhat_unload_callback_all(sfmmup->sfmmu_as, addr,
   5588 			    len, flags, callback);
   5589 	}
   5590 
   5591 	ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \
   5592 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   5593 
        	/* NOTE(review): sfmmup has already been dereferenced above. */
   5594 	ASSERT(sfmmup != NULL);
   5595 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   5596 	ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
   5597 
   5598 	/*
   5599 	 * Probing through a large VA range (say 63 bits) will be slow, even
   5600 	 * at 4 Meg steps between the probes. So, when the virtual address range
   5601 	 * is very large, search the HME entries for what to unload.
   5602 	 *
   5603 	 *	len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
   5604 	 *
   5605 	 *	UHMEHASH_SZ is number of hash buckets to examine
   5606 	 *
   5607 	 */
   5608 	if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
   5609 		hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
   5610 		return;
   5611 	}
   5612 
   5613 	CPUSET_ZERO(cpuset);
   5614 
   5615 	/*
   5616 	 * If the process is exiting, we can save a lot of fuss since
   5617 	 * we'll flush the TLB when we free the ctx anyway.
   5618 	 */
   5619 	if (sfmmup->sfmmu_free)
   5620 		dmrp = NULL;
   5621 	else
   5622 		dmrp = &dmr;
   5623 
        	/* dmrp may be NULL here; the DEMAP_RANGE_* macros are given it as is */
   5624 	DEMAP_RANGE_INIT(sfmmup, dmrp);
   5625 	endaddr = addr + len;
   5626 	hblktag.htag_id = sfmmup;
   5627 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   5628 
   5629 	/*
   5630 	 * It is likely for the vm to call unload over a wide range of
   5631 	 * addresses that are actually very sparsely populated by
   5632 	 * translations.  In order to speed this up the sfmmu hat supports
   5633 	 * the concept of shadow hmeblks. Dummy large page hmeblks that
   5634 	 * correspond to actual small translations are allocated at tteload
   5635 	 * time and are referred to as shadow hmeblks.  Now, during unload
   5636 	 * time, we first check if we have a shadow hmeblk for that
   5637 	 * translation.  The absence of one means the corresponding address
   5638 	 * range is empty and can be skipped.
   5639 	 *
   5640 	 * The kernel is an exception to above statement and that is why
   5641 	 * we don't use shadow hmeblks and hash starting from the smallest
   5642 	 * page size.
   5643 	 */
   5644 	if (sfmmup == KHATID) {
   5645 		iskernel = 1;
   5646 		hashno = TTE64K;
   5647 	} else {
   5648 		iskernel = 0;
   5649 		if (mmu_page_sizes == max_mmu_page_sizes) {
   5650 			hashno = TTE256M;
   5651 		} else {
   5652 			hashno = TTE4M;
   5653 		}
   5654 	}
        	/*
        	 * Each pass probes the hash for (addr, hashno).  A pass either
        	 * advances addr, or changes hashno and probes the same addr again.
        	 */
   5655 	while (addr < endaddr) {
   5656 		hmeshift = HME_HASH_SHIFT(hashno);
   5657 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   5658 		hblktag.htag_rehash = hashno;
   5659 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   5660 
   5661 		SFMMU_HASH_LOCK(hmebp);
   5662 
   5663 		HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
   5664 		if (hmeblkp == NULL) {
   5665 			/*
   5666 			 * didn't find an hmeblk. skip the appropriate
   5667 			 * address range.
   5668 			 */
   5669 			SFMMU_HASH_UNLOCK(hmebp);
        			/*
        			 * Kernel: retry the same addr with the next larger
        			 * hash size; once hashno has reached mmu_hashcnt,
        			 * step to the next 64K boundary and start over at
        			 * TTE64K.
        			 */
   5670 			if (iskernel) {
   5671 				if (hashno < mmu_hashcnt) {
   5672 					hashno++;
   5673 					continue;
   5674 				} else {
   5675 					hashno = TTE64K;
   5676 					addr = (caddr_t)roundup((uintptr_t)addr
   5677 					    + 1, MMU_PAGESIZE64K);
   5678 					continue;
   5679 				}
   5680 			}
        			/*
        			 * User: skip to the next boundary of the size just
        			 * probed, then pick the hash size for the next probe
        			 * from the alignment of the new addr: not 512K
        			 * aligned -> keep TTE64K; not 4M aligned -> TTE512K;
        			 * then, if mmu_page_sizes == max_mmu_page_sizes,
        			 * TTE4M / TTE32M / TTE256M according to 32M and 256M
        			 * alignment, otherwise TTE4M.
        			 */
   5681 			addr = (caddr_t)roundup((uintptr_t)addr + 1,
   5682 			    (1 << hmeshift));
   5683 			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
   5684 				ASSERT(hashno == TTE64K);
   5685 				continue;
   5686 			}
   5687 			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
   5688 				hashno = TTE512K;
   5689 				continue;
   5690 			}
   5691 			if (mmu_page_sizes == max_mmu_page_sizes) {
   5692 				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
   5693 					hashno = TTE4M;
   5694 					continue;
   5695 				}
   5696 				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
   5697 					hashno = TTE32M;
   5698 					continue;
   5699 				}
   5700 				hashno = TTE256M;
   5701 				continue;
   5702 			} else {
   5703 				hashno = TTE4M;
   5704 				continue;
   5705 			}
   5706 		}
   5707 		ASSERT(hmeblkp);
   5708 		ASSERT(!hmeblkp->hblk_shared);
   5709 		if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
   5710 			/*
   5711 			 * If the valid count is zero we can skip the range
   5712 			 * mapped by this hmeblk.
   5713 			 * We free hblks in the case of HAT_UNMAP.  HAT_UNMAP
   5714 			 * is used by segment drivers as a hint
   5715 			 * that the mapping resource won't be used any longer.
   5716 			 * The best example of this is during exit().
   5717 			 */
   5718 			addr = (caddr_t)roundup((uintptr_t)addr + 1,
   5719 			    get_hblk_span(hmeblkp));
   5720 			if ((flags & HAT_UNLOAD_UNMAP) ||
   5721 			    (iskernel && !issegkmap)) {
   5722 				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
   5723 				    &list, 0);
   5724 			}
   5725 			SFMMU_HASH_UNLOCK(hmebp);
   5726 
        			/* same next-hash-size selection as in the miss case */
   5727 			if (iskernel) {
   5728 				hashno = TTE64K;
   5729 				continue;
   5730 			}
   5731 			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
   5732 				ASSERT(hashno == TTE64K);
   5733 				continue;
   5734 			}
   5735 			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
   5736 				hashno = TTE512K;
   5737 				continue;
   5738 			}
   5739 			if (mmu_page_sizes == max_mmu_page_sizes) {
   5740 				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
   5741 					hashno = TTE4M;
   5742 					continue;
   5743 				}
   5744 				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
   5745 					hashno = TTE32M;
   5746 					continue;
   5747 				}
   5748 				hashno = TTE256M;
   5749 				continue;
   5750 			} else {
   5751 				hashno = TTE4M;
   5752 				continue;
   5753 			}
   5754 		}
   5755 		if (hmeblkp->hblk_shw_bit) {
   5756 			/*
   5757 			 * If we encounter a shadow hmeblk we know there is
   5758 			 * smaller sized hmeblks mapping the same address space.
   5759 			 * Decrement the hash size and rehash.
   5760 			 */
   5761 			ASSERT(sfmmup != KHATID);
   5762 			hashno--;
   5763 			SFMMU_HASH_UNLOCK(hmebp);
   5764 			continue;
   5765 		}
   5766 
   5767 		/*
   5768 		 * track callback address ranges.
   5769 		 * only start a new range when it's not contiguous
        		 *
        		 * When this unload begins exactly where the previously
        		 * recorded range ended, that slot is reused: its start is
        		 * kept and its end is overwritten after the unload below.
   5770 		 */
   5771 		if (callback != NULL) {
   5772 			if (addr_count > 0 &&
   5773 			    addr == cb_end_addr[addr_count - 1])
   5774 				--addr_count;
   5775 			else
   5776 				cb_start_addr[addr_count] = addr;
   5777 		}
   5778 
   5779 		addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
   5780 		    dmrp, flags);
   5781 
   5782 		if (callback != NULL)
   5783 			cb_end_addr[addr_count++] = addr;
   5784 
        		/* drop the hmeblk from the hash once it holds no mappings */
   5785 		if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
   5786 		    !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
   5787 			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
   5788 		}
   5789 		SFMMU_HASH_UNLOCK(hmebp);
   5790 
   5791 		/*
   5792 		 * Notify our caller as to exactly which pages
   5793 		 * have been unloaded. We do these in clumps,
   5794 		 * to minimize the number of xt_sync()s that need to occur.
   5795 		 */
   5796 		if (callback != NULL && addr_count == MAX_CB_ADDR) {
   5797 			DEMAP_RANGE_FLUSH(dmrp);
   5798 			if (dmrp != NULL) {
   5799 				cpuset = sfmmup->sfmmu_cpusran;
   5800 				xt_sync(cpuset);
   5801 			}
   5802 
   5803 			for (a = 0; a < MAX_CB_ADDR; ++a) {
   5804 				callback->hcb_start_addr = cb_start_addr[a];
   5805 				callback->hcb_end_addr = cb_end_addr[a];
   5806 				callback->hcb_function(callback);
   5807 			}
   5808 			addr_count = 0;
   5809 		}
        		/* same next-hash-size selection as in the miss case */
   5810 		if (iskernel) {
   5811 			hashno = TTE64K;
   5812 			continue;
   5813 		}
   5814 		if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
   5815 			ASSERT(hashno == TTE64K);
   5816 			continue;
   5817 		}
   5818 		if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
   5819 			hashno = TTE512K;
   5820 			continue;
   5821 		}
   5822 		if (mmu_page_sizes == max_mmu_page_sizes) {
   5823 			if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
   5824 				hashno = TTE4M;
   5825 				continue;
   5826 			}
   5827 			if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
   5828 				hashno = TTE32M;
   5829 				continue;
   5830 			}
   5831 			hashno = TTE256M;
   5832 		} else {
   5833 			hashno = TTE4M;
   5834 		}
   5835 	}
   5836 
        	/*
        	 * Dispose of the hmeblks unlinked above, finish the demaps, then
        	 * report any callback ranges still buffered.
        	 */
   5837 	sfmmu_hblks_list_purge(&list, 0);
   5838 	DEMAP_RANGE_FLUSH(dmrp);
   5839 	if (dmrp != NULL) {
   5840 		cpuset = sfmmup->sfmmu_cpusran;
   5841 		xt_sync(cpuset);
   5842 	}
   5843 	if (callback && addr_count != 0) {
   5844 		for (a = 0; a < addr_count; ++a) {
   5845 			callback->hcb_start_addr = cb_start_addr[a];
   5846 			callback->hcb_end_addr = cb_end_addr[a];
   5847 			callback->hcb_function(callback);
   5848 		}
   5849 	}
   5850 
   5851 	/*
   5852 	 * Check TSB and TLB page sizes if the process isn't exiting.
   5853 	 */
   5854 	if (!sfmmup->sfmmu_free)
   5855 		sfmmu_check_page_sizes(sfmmup, 0);
   5856 }
   5857 
   5858 /*
   5859  * Unload all the mappings in the range [addr..addr+len). addr and len must
   5860  * be MMU_PAGESIZE aligned.
   5861  */
   5862 void
   5863 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
   5864 {
   5865 	if (sfmmup->sfmmu_xhat_provider) {
   5866 		XHAT_UNLOAD(sfmmup, addr, len, flags);
   5867 		return;
   5868 	}
   5869 	hat_unload_callback(sfmmup, addr, len, flags, NULL);
   5870 }
   5871 
   5872 
   5873 /*
   5874  * Find the largest mapping size for this page.
   5875  */
   5876 int
   5877 fnd_mapping_sz(page_t *pp)
   5878 {
   5879 	int sz;
   5880 	int p_index;
   5881 
   5882 	p_index = PP_MAPINDEX(pp);
   5883 
   5884 	sz = 0;
   5885 	p_index >>= 1;	/* don't care about 8K bit */
   5886 	for (; p_index; p_index >>= 1) {
   5887 		sz++;
   5888 	}
   5889 
   5890 	return (sz);
   5891 }
   5892 
   5893 /*
   5894  * This function unloads a range of addresses for an hmeblk.
   5895  * It returns the next address to be unloaded.
   5896  * It should be called with the hash lock held.
         *
         * addr must lie within hmeblkp.  endaddr is clamped to the end of the
         * hmeblk, so at most the rest of this one block is processed.  sfmmup
         * may be NULL only for a shared hmeblk, in which case dmrp must be NULL
         * as well (see the ASSERTs below).
         *
         * When dmrp is non-NULL and its page size equals this hmeblk's tte size,
         * unloaded pages are marked in the demap range.  When dmrp is non-NULL
         * but the sizes differ, the pending range is flushed first and each page
         * is demapped individually with sfmmu_tlb_demap().  When dmrp is NULL no
         * individual demap is issued from here.
         *
         * flags:
         *	HAT_UNLOAD_NOSYNC	skip sfmmu_ttesync() for unloaded ttes
         *	HAT_UNLOAD_UNLOCK	decrement hblk_lckcnt for each unloaded tte
         *
         * For every tte invalidated, hblk_vcnt is decremented; if the hment had
         * a page, the hment is taken off the page's mapping list and hblk_hmecnt
         * is decremented.  Finally sfmmu_ttecnt[ttesz] of a non-NULL sfmmup is
         * reduced by the number of ttes unloaded.
   5897  */
   5898 static caddr_t
   5899 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
   5900 	caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
   5901 {
   5902 	tte_t	tte, ttemod;
   5903 	struct	sf_hment *sfhmep;
   5904 	int	ttesz;
   5905 	long	ttecnt;
   5906 	page_t *pp;
   5907 	kmutex_t *pml;
   5908 	int ret;
   5909 	int use_demap_range;
   5910 
   5911 	ASSERT(in_hblk_range(hmeblkp, addr));
   5912 	ASSERT(!hmeblkp->hblk_shw_bit);
   5913 	ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
   5914 	ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
   5915 	ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);
   5916 
   5917 #ifdef DEBUG
   5918 	if (get_hblk_ttesz(hmeblkp) != TTE8K &&
   5919 	    (endaddr < get_hblk_endaddr(hmeblkp))) {
   5920 		panic("sfmmu_hblk_unload: partial unload of large page");
   5921 	}
   5922 #endif /* DEBUG */
   5923 
   5924 	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
   5925 	ttesz = get_hblk_ttesz(hmeblkp);
   5926 
        	/*
        	 * Range demap is used when there is no range at all (dmrp == NULL)
        	 * or when the range's page size matches this hmeblk's tte size.
        	 */
   5927 	use_demap_range = ((dmrp == NULL) ||
   5928 	    (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));
   5929 
   5930 	if (use_demap_range) {
   5931 		DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
   5932 	} else {
   5933 		DEMAP_RANGE_FLUSH(dmrp);
   5934 	}
   5935 	ttecnt = 0;
   5936 	HBLKTOHME(sfhmep, hmeblkp, addr);
   5937 
        	/*
        	 * One hment per pass.  A "continue" re-examines the same hment:
        	 * addr and sfhmep only advance at the bottom of the loop.
        	 */
   5938 	while (addr < endaddr) {
   5939 		pml = NULL;
   5940 		sfmmu_copytte(&sfhmep->hme_tte, &tte);
   5941 		if (TTE_IS_VALID(&tte)) {
   5942 			pp = sfhmep->hme_page;
   5943 			if (pp != NULL) {
   5944 				pml = sfmmu_mlist_enter(pp);
   5945 			}
   5946 
   5947 			/*
   5948 			 * Verify if hme still points to 'pp' now that
   5949 			 * we have p_mapping lock.
   5950 			 */
   5951 			if (sfhmep->hme_page != pp) {
   5952 				if (pp != NULL && sfhmep->hme_page != NULL) {
   5953 					ASSERT(pml != NULL);
   5954 					sfmmu_mlist_exit(pml);
   5955 					/* Re-start this iteration. */
   5956 					continue;
   5957 				}
        				/*
        				 * hme_page went to NULL while we waited for
        				 * the lock; pml is released at tte_unloaded.
        				 */
   5958 				ASSERT((pp != NULL) &&
   5959 				    (sfhmep->hme_page == NULL));
   5960 				goto tte_unloaded;
   5961 			}
   5962 
   5963 			/*
   5964 			 * This point on we have both HASH and p_mapping
   5965 			 * lock.
   5966 			 */
   5967 			ASSERT(pp == sfhmep->hme_page);
   5968 			ASSERT(pp == NULL || sfmmu_mlist_held(pp));
   5969 
   5970 			/*
   5971 			 * We need to loop on modify tte because it is
   5972 			 * possible for pagesync to come along and
   5973 			 * change the software bits beneath us.
   5974 			 *
   5975 			 * Page_unload can also invalidate the tte after
   5976 			 * we read tte outside of p_mapping lock.
   5977 			 */
   5978 again:
   5979 			ttemod = tte;
   5980 
   5981 			TTE_SET_INVALID(&ttemod);
   5982 			ret = sfmmu_modifytte_try(&tte, &ttemod,
   5983 			    &sfhmep->hme_tte);
   5984 
        			/*
        			 * ret <= 0: the tte was not invalidated by us.  If
        			 * the tte now held in 'tte' is still valid, retry
        			 * with it.  If it is invalid, that is fatal when a
        			 * page is attached; otherwise re-examine this hment.
        			 */
   5985 			if (ret <= 0) {
   5986 				if (TTE_IS_VALID(&tte)) {
   5987 					ASSERT(ret < 0);
   5988 					goto again;
   5989 				}
   5990 				if (pp != NULL) {
   5991 					panic("sfmmu_hblk_unload: pp = 0x%p "
   5992 					    "tte became invalid under mlist"
   5993 					    " lock = 0x%p", (void *)pp,
   5994 					    (void *)pml);
   5995 				}
   5996 				continue;
   5997 			}
   5998 
   5999 			if (!(flags & HAT_UNLOAD_NOSYNC)) {
   6000 				sfmmu_ttesync(sfmmup, addr, &tte, pp);
   6001 			}
   6002 
   6003 			/*
   6004 			 * Ok- we invalidated the tte. Do the rest of the job.
   6005 			 */
   6006 			ttecnt++;
   6007 
   6008 			if (flags & HAT_UNLOAD_UNLOCK) {
   6009 				ASSERT(hmeblkp->hblk_lckcnt > 0);
   6010 				atomic_add_32(&hmeblkp->hblk_lckcnt, -1);
   6011 				HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
   6012 			}
   6013 
   6014 			/*
   6015 			 * Normally we would need to flush the page
   6016 			 * from the virtual cache at this point in
   6017 			 * order to prevent a potential cache alias
   6018 			 * inconsistency.
   6019 			 * The particular scenario we need to worry
   6020 			 * about is:
   6021 			 * Given:  va1 and va2 are two virtual address
   6022 			 * that alias and map the same physical
   6023 			 * address.
   6024 			 * 1.   mapping exists from va1 to pa and data
   6025 			 * has been read into the cache.
   6026 			 * 2.   unload va1.
   6027 			 * 3.   load va2 and modify data using va2.
   6028 			 * 4    unload va2.
   6029 			 * 5.   load va1 and reference data.  Unless we
   6030 			 * flush the data cache when we unload we will
   6031 			 * get stale data.
   6032 			 * Fortunately, page coloring eliminates the
   6033 			 * above scenario by remembering the color a
   6034 			 * physical page was last or is currently
   6035 			 * mapped to.  Now, we delay the flush until
   6036 			 * the loading of translations.  Only when the
   6037 			 * new translation is of a different color
   6038 			 * are we forced to flush.
   6039 			 */
   6040 			if (use_demap_range) {
   6041 				/*
   6042 				 * Mark this page as needing a demap.
   6043 				 */
   6044 				DEMAP_RANGE_MARKPG(dmrp, addr);
   6045 			} else {
   6046 				ASSERT(sfmmup != NULL);
   6047 				ASSERT(!hmeblkp->hblk_shared);
   6048 				sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
   6049 				    sfmmup->sfmmu_free, 0);
   6050 			}
   6051 
   6052 			if (pp) {
   6053 				/*
   6054 				 * Remove the hment from the mapping list
   6055 				 */
   6056 				ASSERT(hmeblkp->hblk_hmecnt > 0);
   6057 
   6058 				/*
   6059 				 * Again, we cannot
   6060 				 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
   6061 				 */
   6062 				HME_SUB(sfhmep, pp);
        				/*
        				 * Barrier sits between the HME_SUB stores
        				 * and the hblk_hmecnt decrement -- presumably
        				 * so the count never drops before the unlink
        				 * is visible; confirm.
        				 */
   6063 				membar_stst();
   6064 				atomic_add_16(&hmeblkp->hblk_hmecnt, -1);
   6065 			}
   6066 
   6067 			ASSERT(hmeblkp->hblk_vcnt > 0);
   6068 			atomic_add_16(&hmeblkp->hblk_vcnt, -1);
   6069 
   6070 			ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
   6071 			    !hmeblkp->hblk_lckcnt);
   6072 
   6073 #ifdef VAC
   6074 			if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
   6075 				if (PP_ISTNC(pp)) {
   6076 					/*
   6077 					 * If page was temporary
   6078 					 * uncached, try to recache
   6079 					 * it. Note that HME_SUB() was
   6080 					 * called above so p_index and
   6081 					 * mlist had been updated.
   6082 					 */
   6083 					conv_tnc(pp, ttesz);
   6084 				} else if (pp->p_mapping == NULL) {
   6085 					ASSERT(kpm_enable);
   6086 					/*
   6087 					 * Page is marked to be in VAC conflict
   6088 					 * to an existing kpm mapping and/or is
   6089 					 * kpm mapped using only the regular
   6090 					 * pagesize.
   6091 					 */
   6092 					sfmmu_kpm_hme_unload(pp);
   6093 				}
   6094 			}
   6095 #endif	/* VAC */
   6096 		} else if ((pp = sfhmep->hme_page) != NULL) {
   6097 				/*
   6098 				 * TTE is invalid but the hme
   6099 				 * still exists. let pageunload
   6100 				 * complete its job.
        				 * If the page is still attached once we
        				 * hold the mapping list lock, drop the lock
        				 * and look at this hment again.
   6101 				 */
   6102 				ASSERT(pml == NULL);
   6103 				pml = sfmmu_mlist_enter(pp);
   6104 				if (sfhmep->hme_page != NULL) {
   6105 					sfmmu_mlist_exit(pml);
   6106 					continue;
   6107 				}
   6108 				ASSERT(sfhmep->hme_page == NULL);
   6109 		} else if (hmeblkp->hblk_hmecnt != 0) {
   6110 			/*
   6111 			 * pageunload may have not finished decrementing
   6112 			 * hblk_vcnt and hblk_hmecnt. Find page_t if any and
   6113 			 * wait for pageunload to finish. Rely on pageunload
   6114 			 * to decrement hblk_hmecnt after hblk_vcnt.
        			 * The wait is done by acquiring and immediately
        			 * releasing the page's mapping list lock.
   6115 			 */
   6116 			pfn_t pfn = TTE_TO_TTEPFN(&tte);
   6117 			ASSERT(pml == NULL);
   6118 			if (pf_is_memory(pfn)) {
   6119 				pp = page_numtopp_nolock(pfn);
   6120 				if (pp != NULL) {
   6121 					pml = sfmmu_mlist_enter(pp);
   6122 					sfmmu_mlist_exit(pml);
   6123 					pml = NULL;
   6124 				}
   6125 			}
   6126 		}
   6127 
   6128 tte_unloaded:
   6129 		/*
   6130 		 * At this point, the tte we are looking at
   6131 		 * should be unloaded, and hme has been unlinked
   6132 		 * from page too. This is important because in
   6133 		 * pageunload, it does ttesync() then HME_SUB.
   6134 		 * We need to make sure HME_SUB has been completed
   6135 		 * so we know ttesync() has been completed. Otherwise,
   6136 		 * at exit time, after return from hat layer, VM will
   6137 		 * release as structure which hat_setstat() (called
   6138 		 * by ttesync()) needs.
   6139 		 */
   6140 #ifdef DEBUG
   6141 		{
   6142 			tte_t	dtte;
   6143 
   6144 			ASSERT(sfhmep->hme_page == NULL);
   6145 
   6146 			sfmmu_copytte(&sfhmep->hme_tte, &dtte);
   6147 			ASSERT(!TTE_IS_VALID(&dtte));
   6148 		}
   6149 #endif
   6150 
   6151 		if (pml) {
   6152 			sfmmu_mlist_exit(pml);
   6153 		}
   6154 
        		/* advance to the next hment of this hmeblk */
   6155 		addr += TTEBYTES(ttesz);
   6156 		sfhmep++;
   6157 		DEMAP_RANGE_NEXTPG(dmrp);
   6158 	}
   6159 	/*
   6160 	 * For shared hmeblks this routine is only called when region is freed
   6161 	 * and no longer referenced.  So no need to decrement ttecnt
   6162 	 * in the region structure here.
   6163 	 */
   6164 	if (ttecnt > 0 && sfmmup != NULL) {
   6165 		atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
   6166 	}
   6167 	return (addr);
   6168 }
   6169 
   6170 /*
   6171  * Invalidate a virtual address range for the local CPU.
   6172  * For best performance ensure that the va range is completely
   6173  * mapped, otherwise the entire TLB will be flushed.
   6174  */
   6175 void
   6176 hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
   6177 {
   6178 	ssize_t sz;
   6179 	caddr_t endva = va + size;
   6180 
   6181 	while (va < endva) {
   6182 		sz = hat_getpagesize(sfmmup, va);
   6183 		if (sz < 0) {
   6184 			vtag_flushall();
   6185 			break;
   6186 		}
   6187 		vtag_flushpage(va, (uint64_t)sfmmup);
   6188 		va += sz;
   6189 	}
   6190 }
   6191 
   6192 /*
   6193  * Synchronize all the mappings in the range [addr..addr+len).
   6194  * Can be called with clearflag having two states:
   6195  * HAT_SYNC_DONTZERO means just return the rm stats
   6196  * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
   6197  */
   6198 void
   6199 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
   6200 {
   6201 	struct hmehash_bucket *hmebp;
   6202 	hmeblk_tag hblktag;
   6203 	int hmeshift, hashno = 1;
   6204 	struct hme_blk *hmeblkp, *list = NULL;
   6205 	caddr_t endaddr;
   6206 	cpuset_t cpuset;
   6207 
   6208 	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
   6209 	ASSERT((sfmmup == ksfmmup) ||
   6210 	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
   6211 	ASSERT((len & MMU_PAGEOFFSET) == 0);
   6212 	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
   6213 	    (clearflag == HAT_SYNC_ZERORM));
   6214 
   6215 	CPUSET_ZERO(cpuset);
   6216 
   6217 	endaddr = addr + len;
   6218 	hblktag.htag_id = sfmmup;
   6219 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   6220 
   6221 	/*
   6222 	 * Spitfire supports 4 page sizes.
   6223 	 * Most pages are expected to be of the smallest page
   6224 	 * size (8K) and these will not need to be rehashed. 64K
   6225 	 * pages also don't need to be rehashed because the an hmeblk
   6226 	 * spans 64K of address space. 512K pages might need 1 rehash and
   6227 	 * and 4M pages 2 rehashes.
   6228 	 */
   6229 	while (addr < endaddr) {
   6230 		hmeshift = HME_HASH_SHIFT(hashno);
   6231 		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
   6232 		hblktag.htag_rehash = hashno;
   6233 		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
   6234 
   6235 		SFMMU_HASH_LOCK(hmebp);
   6236 
   6237 		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
   6238 		if (hmeblkp != NULL) {
   6239 			ASSERT(!hmeblkp->hblk_shared);
   6240 			/*
   6241 			 * We've encountered a shadow hmeblk so skip the range
   6242 			 * of the next smaller mapping size.
   6243 			 */
   6244 			if (hmeblkp->hblk_shw_bit) {
   6245 				ASSERT(sfmmup != ksfmmup);
   6246 				ASSERT(hashno > 1);
   6247 				addr = (caddr_t)P2END((uintptr_t)addr,
   6248 				    TTEBYTES(hashno - 1));
   6249 			} else {
   6250 				addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
   6251 				    addr, endaddr, clearflag);
   6252 			}
   6253 			SFMMU_HASH_UNLOCK(hmebp);
   6254 			hashno = 1;
   6255 			continue;
   6256 		}
   6257 		SFMMU_HASH_UNLOCK(hmebp);
   6258 
   6259 		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
   6260 			/*
   6261 			 * We have traversed the whole list and rehashed
   6262 			 * if necessary without finding the address to sync.
   6263 			 * This is ok so we increment the address by the
   6264 			 * smallest hmeblk range for kernel mappings and the
   6265 			 * largest hmeblk range, to account for shadow hmeblks,
   6266 			 * for user mappings and continue.
   6267 			 */
   6268 			if (sfmmup == ksfmmup)
   6269 				addr = (caddr_t)P2END((uintptr_t)addr,
   6270 				    TTEBYTES(1));
   6271 			else
   6272 				addr = (caddr_t)P2END((uintptr_t)addr,
   6273 				    TTEBYTES(hashno));
   6274 			hashno = 1;
   6275 		} else {
   6276 			hashno++;
   6277 		}
   6278 	}
   6279 	sfmmu_hblks_list_purge(&list, 0);
   6280 	cpuset = sfmmup->sfmmu_cpusran;
   6281 	xt_sync(cpuset);
   6282 }
   6283 
/*
 * Sync the ref/mod state of the ttes of a single hmeblk for the range
 * [addr, endaddr).  The range is clipped to the end of the hmeblk, so the
 * caller is expected to loop using the returned address.
 *
 * For every valid tte in the range:
 *   - if the hment has a page, that page's mapping list lock is held
 *     while the tte is processed;
 *   - if clearflag is HAT_SYNC_ZERORM the R/M bits are cleared in the tte;
 *   - the tte is passed to sfmmu_ttesync() together with the page.
 *
 * The hmeblk must be neither a shadow hmeblk nor a shared one (asserted).
 *
 * Returns the address following the last tte that was examined.
 */
static caddr_t
sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
	caddr_t endaddr, int clearflag)
{
	tte_t	tte, ttemod;
	struct sf_hment *sfhmep;
	int ttesz;
	struct page *pp;
	kmutex_t *pml;
	int ret;

	ASSERT(hmeblkp->hblk_shw_bit == 0);
	ASSERT(!hmeblkp->hblk_shared);

	/* Never walk past the end of this hmeblk. */
	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));

	ttesz = get_hblk_ttesz(hmeblkp);
	HBLKTOHME(sfhmep, hmeblkp, addr);

	while (addr < endaddr) {
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		if (TTE_IS_VALID(&tte)) {
			pml = NULL;
			pp = sfhmep->hme_page;
			if (pp) {
				pml = sfmmu_mlist_enter(pp);
			}
			if (pp != sfhmep->hme_page) {
				/*
				 * tte must have been unloaded
				 * underneath us.  Recheck: the continue
				 * re-reads the same hment since addr and
				 * sfhmep have not been advanced.
				 */
				ASSERT(pml);
				sfmmu_mlist_exit(pml);
				continue;
			}

			ASSERT(pp == NULL || sfmmu_mlist_held(pp));

			if (clearflag == HAT_SYNC_ZERORM) {
				ttemod = tte;
				TTE_CLR_RM(&ttemod);
				ret = sfmmu_modifytte_try(&tte, &ttemod,
				    &sfhmep->hme_tte);
				if (ret < 0) {
					/*
					 * The update did not go through;
					 * drop the lock and retry this
					 * same tte from the top.
					 */
					if (pml) {
						sfmmu_mlist_exit(pml);
					}
					continue;
				}

				/*
				 * NOTE(review): ret > 0 presumably means the
				 * tte was actually changed, hence the demap;
				 * confirm against sfmmu_modifytte_try().
				 */
				if (ret > 0) {
					sfmmu_tlb_demap(addr, sfmmup,
					    hmeblkp, 0, 0);
				}
			}
			sfmmu_ttesync(sfmmup, addr, &tte, pp);
			if (pml) {
				sfmmu_mlist_exit(pml);
			}
		}
		addr += TTEBYTES(ttesz);
		sfhmep++;
	}
	return (addr);
}
   6350 
   6351 /*
   6352  * This function will sync a tte to the page struct and it will
   6353  * update the hat stats. Currently it allows us to pass a NULL pp
   6354  * and we will simply update the stats.  We may want to change this
   6355  * so we only keep stats for pages backed by pp's.
   6356  */
   6357 static void
   6358 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
   6359 {
   6360 	uint_t rm = 0;
   6361 	int   	sz;
   6362 	pgcnt_t	npgs;
   6363 
   6364 	ASSERT(TTE_IS_VALID(ttep));
   6365 
   6366 	if (TTE_IS_NOSYNC(ttep)) {
   6367 		return;
   6368 	}
   6369 
   6370 	if (TTE_IS_REF(ttep))  {
   6371 		rm = P_REF;
   6372 	}
   6373 	if (TTE_IS_MOD(ttep))  {
   6374 		rm |= P_MOD;
   6375 	}
   6376 
   6377 	if (rm == 0) {
   6378 		return;
   6379 	}
   6380 
   6381 	sz = TTE_CSZ(ttep);
   6382 	if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
   6383 		int i;
   6384 		caddr_t	vaddr = addr;
   6385 
   6386 		for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
   6387 			hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
   6388 		}
   6389 
   6390 	}
   6391 
   6392 	/*
   6393 	 * XXX I want to use cas to update nrm bits but they
   6394 	 * currently belong in common/vm and not in hat where
   6395 	 * they should be.
   6396 	 * The nrm bits are protected by the same mutex as
   6397 	 * the one that protects the page's mapping list.
   6398 	 */
   6399 	if (!pp)
   6400 		return;
   6401 	ASSERT(sfmmu_mlist_held(pp));
   6402 	/*
   6403 	 * If the tte is for a large page, we need to sync all the
   6404 	 * pages covered by the tte.
   6405 	 */
   6406 	if (sz != TTE8K) {
   6407 		ASSERT(pp->p_szc != 0);
   6408 		pp = PP_GROUPLEADER(pp, sz);
   6409 		ASSERT(sfmmu_mlist_held(pp));
   6410 	}
   6411 
   6412 	/* Get number of pages from tte size. */
   6413 	npgs = TTEPAGES(sz);
   6414 
   6415 	do {
   6416 		ASSERT(pp);
   6417 		ASSERT(sfmmu_mlist_held(pp));
   6418 		if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
   6419 		    ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
   6420 			hat_page_setattr(pp, rm);
   6421 
   6422 		/*
   6423 		 * Are we done? If not, we must have a large mapping.
   6424 		 * For large mappings we need to sync the rest of the pages
   6425 		 * covered by this tte; goto the next page.
   6426 		 */
   6427 	} while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));
   6428 }
   6429 
/*
 * Execute pre-callback handler of each pa_hment linked to pp
 *
 * Inputs:
 *   pp: page whose p_mapping list is scanned; must be held SE_EXCL.
 *   flag: either HAT_PRESUSPEND or HAT_SUSPEND.
 *   capture_cpus: pointer to return value (below); may be NULL.
 *
 * Returns:
 *   Propagates the subsystem callback return values back to the caller;
 *   returns 0 on success.  If capture_cpus is non-NULL, the value returned
 *   is zero if all of the pa_hments are of a type that do not require
 *   capturing CPUs prior to suspending the mapping, else it is 1.
 *
 * Locking:
 *   If the caller does not already hold pp's mapping list lock, it is
 *   taken here and dropped around each handler invocation.  When a handler
 *   fails, the function returns immediately; in that case a lock taken
 *   here has already been released.  pa_hments whose handler has been
 *   processed are marked with 'flag' so they are skipped on rescans and
 *   can be found by hat_pageprocess_postcallbacks().
 */
static int
hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
{
	struct sf_hment	*sfhmep;
	struct pa_hment *pahmep;
	int (*f)(caddr_t, uint_t, uint_t, void *);
	int		ret;
	id_t		id;
	int		locked = 0;	/* set iff we took the mlist lock */
	kmutex_t	*pml;

	ASSERT(PAGE_EXCL(pp));
	if (!sfmmu_mlist_held(pp)) {
		pml = sfmmu_mlist_enter(pp);
		locked = 1;
	}

	if (capture_cpus)
		*capture_cpus = 0;

top:
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
		/*
		 * skip sf_hments corresponding to VA<->PA mappings;
		 * for pa_hment's, hme_tte.ll is zero
		 */
		if (!IS_PAHME(sfhmep))
			continue;

		pahmep = sfhmep->hme_data;
		ASSERT(pahmep != NULL);

		/*
		 * skip if pre-handler has been called earlier in this loop
		 */
		if (pahmep->flags & flag)
			continue;

		id = pahmep->cb_id;
		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
		if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
			*capture_cpus = 1;
		if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
			/* No handler to run; just mark it as processed. */
			pahmep->flags |= flag;
			continue;
		}

		/*
		 * Drop the mapping list lock to avoid locking order issues.
		 */
		if (locked)
			sfmmu_mlist_exit(pml);

		ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
		if (ret != 0)
			return (ret);	/* caller must do the cleanup */

		/*
		 * The list was unprotected while the handler ran, so after
		 * reacquiring the lock restart the scan from the head;
		 * already-flagged entries are skipped above.
		 */
		if (locked) {
			pml = sfmmu_mlist_enter(pp);
			pahmep->flags |= flag;
			goto top;
		}

		pahmep->flags |= flag;
	}

	if (locked)
		sfmmu_mlist_exit(pml);

	return (0);
}
   6514 
/*
 * Execute post-callback handler of each pa_hment linked to pp
 *
 * Only pa_hments that currently carry 'flag' in their flags word are
 * processed; the flag is cleared before the handler (if any) is called.
 * The handler additionally receives the pfn of the constituent page that
 * corresponds to the pa_hment's address.  A handler returning non-zero
 * is fatal (panic).
 *
 * Same overall assumptions and restrictions apply as for
 * hat_pageprocess_precallbacks().
 */
static void
hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
{
	pfn_t pgpfn = pp->p_pagenum;
	/* mask selecting the constituent-page index within pp's page size */
	pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
	pfn_t newpfn;
	struct sf_hment *sfhmep;
	struct pa_hment *pahmep;
	int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);
	id_t	id;
	int	locked = 0;	/* set iff we took the mlist lock */
	kmutex_t *pml;

	ASSERT(PAGE_EXCL(pp));
	if (!sfmmu_mlist_held(pp)) {
		pml = sfmmu_mlist_enter(pp);
		locked = 1;
	}

top:
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
		/*
		 * skip sf_hments corresponding to VA<->PA mappings;
		 * for pa_hment's, hme_tte.ll is zero
		 */
		if (!IS_PAHME(sfhmep))
			continue;

		pahmep = sfhmep->hme_data;
		ASSERT(pahmep != NULL);

		/* Skip entries that are not marked with this flag. */
		if ((pahmep->flags & flag) == 0)
			continue;

		/*
		 * Clear the mark first so that a rescan from 'top' does not
		 * process this entry a second time.
		 */
		pahmep->flags &= ~flag;

		id = pahmep->cb_id;
		ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
		if ((f = sfmmu_cb_table[id].posthandler) == NULL)
			continue;

		/*
		 * Convert the base page PFN into the constituent PFN
		 * which is needed by the callback handler.
		 */
		newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);

		/*
		 * Drop the mapping list lock to avoid locking order issues.
		 */
		if (locked)
			sfmmu_mlist_exit(pml);

		if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
		    != 0)
			panic("sfmmu: posthandler failed");

		/*
		 * The list was unprotected while the handler ran; retake
		 * the lock and restart the scan from the head.
		 */
		if (locked) {
			pml = sfmmu_mlist_enter(pp);
			goto top;
		}
	}

	if (locked)
		sfmmu_mlist_exit(pml);
}
   6587 
/*
 * Suspend locked kernel mapping
 *
 * Sets the suspend bit in the tte of every VA<->PA mapping of pp, for
 * each mapping size recorded in the page's map index, and flushes the
 * corresponding TSB and TLB entries on all ready CPUs.
 *
 * The caller must hold pp SE_EXCL and hold its mapping list lock.
 *
 * Note that kpr_suspendlock is acquired here and is still held on return,
 * and that T_DONTDTRACE is left set on the current thread.
 * NOTE(review): both are presumably undone by the matching reload path
 * (hat_pagereload()) - confirm.
 */
void
hat_pagesuspend(struct page *pp)
{
	struct sf_hment *sfhmep;
	sfmmu_t *sfmmup;
	tte_t tte, ttemod;
	struct hme_blk *hmeblkp;
	caddr_t addr;
	int index, cons;
	cpuset_t cpuset;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(sfmmu_mlist_held(pp));

	mutex_enter(&kpr_suspendlock);

	/*
	 * We're about to suspend a kernel mapping so mark this thread as
	 * non-traceable by DTrace. This prevents us from running into issues
	 * with probe context trying to touch a suspended page
	 * in the relocation codepath itself.
	 */
	curthread->t_flag |= T_DONTDTRACE;

	/*
	 * Start with 8K mappings; the loop at the bottom walks the map
	 * index to find the larger mapping sizes present on this page.
	 */
	index = PP_MAPINDEX(pp);
	cons = TTE8K;

retry:
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {

		/* pa_hments are not real mappings; nothing to suspend. */
		if (IS_PAHME(sfhmep))
			continue;

		/* Only handle mappings of the size for this pass. */
		if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons)
			continue;

		/*
		 * Loop until we successfully set the suspend bit in
		 * the TTE.
		 */
again:
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		ASSERT(TTE_IS_VALID(&tte));

		ttemod = tte;
		TTE_SET_SUSPEND(&ttemod);
		if (sfmmu_modifytte_try(&tte, &ttemod,
		    &sfhmep->hme_tte) < 0)
			goto again;

		/*
		 * Invalidate TSB entry
		 */
		hmeblkp = sfmmu_hmetohblk(sfhmep);

		/* Only kernel, non-shared mappings are expected here. */
		sfmmup = hblktosfmmu(hmeblkp);
		ASSERT(sfmmup == ksfmmup);
		ASSERT(!hmeblkp->hblk_shared);

		addr = tte_to_vaddr(hmeblkp, tte);

		/*
		 * No need to make sure that the TSB for this sfmmu is
		 * not being relocated since it is ksfmmup and thus it
		 * will never be relocated.
		 */
		SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);

		/*
		 * Update xcall stats
		 */
		cpuset = cpu_ready_set;
		CPUSET_DEL(cpuset, CPU->cpu_id);

		/* LINTED: constant in conditional context */
		SFMMU_XCALL_STATS(ksfmmup);

		/*
		 * Flush TLB entry on remote CPU's
		 */
		xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
		    (uint64_t)ksfmmup);
		xt_sync(cpuset);

		/*
		 * Flush TLB entry on local CPU
		 */
		vtag_flushpage(addr, (uint64_t)ksfmmup);
	}

	/*
	 * Advance to the next mapping size whose bit is set in the map
	 * index and repeat the scan on that size's group leader page.
	 */
	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}
}
   6691 
#ifdef	DEBUG

/*
 * DEBUG-only circular log of page relocation attempts.  Each call to
 * PAGE_RELOCATE_LOG() records one struct prle in page_relocate_log[],
 * wrapping around after N_PRLE entries; prl_mutex serializes writers.
 */
#define	N_PRLE	1024
struct prle {
	page_t *targ;		/* target page (*t at the time of logging) */
	page_t *repl;		/* replacement page (*r) */
	int status;		/* status value passed by the caller */
	int pausecpus;		/* 'p' argument passed by the caller */
	hrtime_t whence;	/* gethrtime() when the entry was written */
};

static struct prle page_relocate_log[N_PRLE];
static int prl_entry;		/* index of the next slot to be written */
static kmutex_t prl_mutex;

/*
 * t and r are pointers to page pointers and are dereferenced here.
 * The macro expands to several statements that are not wrapped in a
 * do { } while (0), so it must only be used where a statement sequence
 * is valid (e.g. inside braces), never as the sole body of an unbraced
 * if/else.
 */
#define	PAGE_RELOCATE_LOG(t, r, s, p)					\
	mutex_enter(&prl_mutex);					\
	page_relocate_log[prl_entry].targ = *(t);			\
	page_relocate_log[prl_entry].repl = *(r);			\
	page_relocate_log[prl_entry].status = (s);			\
	page_relocate_log[prl_entry].pausecpus = (p);			\
	page_relocate_log[prl_entry].whence = gethrtime();		\
	prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1;	\
	mutex_exit(&prl_mutex);

#else	/* !DEBUG */
/* Logging compiles away entirely in non-DEBUG kernels. */
#define	PAGE_RELOCATE_LOG(t, r, s, p)
#endif
   6720 
   6721 /*
   6722  * Core Kernel Page Relocation Algorithm
   6723  *
   6724  * Input:
   6725  *
   6726  * target : 	constituent pages are SE_EXCL locked.
   6727  * replacement:	constituent pages are SE_EXCL locked.
   6728  *
   6729  * Output:
   6730  *
   6731  * nrelocp:	number of pages relocated
   6732  */
   6733 int
   6734 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp)
   6735 {
   6736 	page_t		*targ, *repl;
   6737 	page_t		*tpp, *rpp;
   6738 	kmutex_t	*low, *high;
   6739 	spgcnt_t	npages, i;
   6740 	page_t		*pl = NULL;
   6741 	int		old_pil;
   6742 	cpuset_t	cpuset;
   6743 	int		cap_cpus;
   6744 	int		ret;
   6745 #ifdef VAC
   6746 	int		cflags = 0;
   6747 #endif
   6748 
   6749 	if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) {
   6750 		PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1);
   6751 		return (EAGAIN);
   6752 	}
   6753 
   6754 	mutex_enter(&kpr_mutex);
   6755 	kreloc_thread = curthread;
   6756 
   6757 	targ = *target;
   6758 	repl = *replacement;
   6759 	ASSERT(repl != NULL);
   6760 	ASSERT(targ->p_szc == repl->p_szc);
   6761 
   6762 	npages = page_get_pagecnt(targ->p_szc);
   6763 
   6764 	/*
   6765 	 * unload VA<->PA mappings that are not locked
   6766 	 */
   6767 	tpp = targ;
   6768 	for (i = 0; i < npages; i++) {
   6769 		(void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC);
   6770 		tpp++;
   6771 	}
   6772 
   6773 	/*
   6774 	 * Do "presuspend" callbacks, in a context from which we can still
   6775 	 * block as needed. Note that we don't hold the mapping list lock
   6776 	 * of "targ" at this point due to potential locking order issues;
   6777 	 * we assume that between the hat_pageunload() above and holding
   6778 	 * the SE_EXCL lock that the mapping list *cannot* change at this
   6779 	 * point.
   6780 	 */
   6781 	ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus);
   6782 	if (ret != 0) {
   6783 		/*
   6784 		 * EIO translates to fatal error, for all others cleanup
   6785 		 * and return EAGAIN.
   6786 		 */
   6787 		ASSERT(ret != EIO);
   6788 		hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND);
   6789 		PAGE_RELOCATE_LOG(target, replacement, ret, -1);
   6790 		kreloc_thread = NULL;
   6791 		mutex_exit(&kpr_mutex);
   6792 		return (EAGAIN);
   6793 	}
   6794 
   6795 	/*
   6796 	 * acquire p_mapping list lock for both the target and replacement
   6797 	 * root pages.
   6798 	 *
   6799 	 * low and high refer to the need to grab the mlist locks in a
   6800 	 * specific order in order to prevent race conditions.  Thus the
   6801 	 * lower lock must be grabbed before the higher lock.
   6802 	 *
   6803 	 * This will block hat_unload's accessing p_mapping list.  Since
   6804 	 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
   6805 	 * blocked.  Thus, no one else will be accessing the p_mapping list
   6806 	 * while we suspend and reload the locked mapping below.
   6807 	 */
   6808 	tpp = targ;
   6809 	rpp = repl;
   6810 	sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high);
   6811 
   6812 	kpreempt_disable();
   6813 
   6814 	/*
   6815 	 * We raise our PIL to 13 so that we don't get captured by
   6816 	 * another CPU or pinned by an interrupt thread.  We can't go to
   6817 	 * PIL 14 since the nexus driver(s) may need to interrupt at
   6818 	 * that level in the case of IOMMU pseudo mappings.
   6819 	 */
   6820 	cpuset = cpu_ready_set;
   6821 	CPUSET_DEL(cpuset, CPU->cpu_id);
   6822 	if (!cap_cpus || CPUSET_ISNULL(cpuset)) {
   6823 		old_pil = splr(XCALL_PIL);
   6824 	} else {
   6825 		old_pil = -1;
   6826 		xc_attention(cpuset);
   6827 	}
   6828 	ASSERT(getpil() == XCALL_PIL);
   6829 
   6830 	/*
   6831 	 * Now do suspend callbacks. In the case of an IOMMU mapping
   6832 	 * this will suspend all DMA activity to the page while it is
   6833 	 * being relocated. Since we are well above LOCK_LEVEL and CPUs
   6834 	 * may be captured at this point we should have acquired any needed
   6835 	 * locks in the presuspend callback.
   6836 	 */
   6837 	ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL);
   6838 	if (ret != 0) {
   6839 		repl = targ;
   6840 		goto suspend_fail;
   6841 	}
   6842 
   6843 	/*
   6844 	 * Raise the PIL yet again, this time to block all high-level
   6845 	 * interrupts on this CPU. This is necessary to prevent an
   6846 	 * interrupt routine from pinning the thread which holds the
   6847 	 * mapping suspended and then touching the suspended page.
   6848 	 *
   6849 	 * Once the page is suspended we also need to be careful to
   6850 	 * avoid calling any functions which touch any seg_kmem memory
   6851 	 * since that memory may be backed by the very page we are
   6852 	 * relocating in here!
   6853 	 */
   6854 	hat_pagesuspend(targ);
   6855 
   6856 	/*
   6857 	 * Now that we are confident everybody has stopped using this page,
   6858 	 * copy the page contents.  Note we use a physical copy to prevent
   6859 	 * locking issues and to avoid fpRAS because we can't handle it in
   6860 	 * this context.
   6861 	 */
   6862 	for (i = 0; i < npages; i++, tpp++, rpp++) {
   6863 #ifdef VAC
   6864 		/*
   6865 		 * If the replacement has a different vcolor than
   6866 		 * the one being replacd, we need to handle VAC
   6867 		 * consistency for it just as we were setting up
   6868 		 * a new mapping to it.
   6869 		 */
   6870 		if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) &&
   6871 		    (tpp->p_vcolor != rpp->p_vcolor) &&
   6872 		    !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) {
   6873 			CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp));
   6874 			sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp),
   6875 			    rpp->p_pagenum);
   6876 		}
   6877 #endif
   6878 		/*
   6879 		 * Copy the contents of the page.
   6880 		 */
   6881 		ppcopy_kernel(tpp, rpp);
   6882 	}
   6883 
   6884 	tpp = targ;
   6885 	rpp = repl;
   6886 	for (i = 0; i < npages; i++, tpp++, rpp++) {
   6887 		/*
   6888 		 * Copy attributes.  VAC consistency was handled above,
   6889 		 * if required.
   6890 		 */
   6891 		rpp->p_nrm = tpp->p_nrm;
   6892 		tpp->p_nrm = 0;
   6893 		rpp->p_index = tpp->p_index;
   6894 		tpp->p_index = 0;
   6895 #ifdef VAC
   6896 		rpp->p_vcolor = tpp->p_vcolor;
   6897 #endif
   6898 	}
   6899 
   6900 	/*
   6901 	 * First, unsuspend the page, if we set the suspend bit, and transfer
   6902 	 * the mapping list from the target page to the replacement page.
   6903 	 * Next process postcallbacks; since pa_hment's are linked only to the
   6904 	 * p_mapping list of root page, we don't iterate over the constituent
   6905 	 * pages.
   6906 	 */
   6907 	hat_pagereload(targ, repl);
   6908 
   6909 suspend_fail:
   6910 	hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND);
   6911 
   6912 	/*
   6913 	 * Now lower our PIL and release any captured CPUs since we
   6914 	 * are out of the "danger zone".  After this it will again be
   6915 	 * safe to acquire adaptive mutex locks, or to drop them...
   6916 	 */
   6917 	if (old_pil != -1) {
   6918 		splx(old_pil);
   6919 	} else {
   6920 		xc_dismissed(cpuset);
   6921 	}
   6922 
   6923 	kpreempt_enable();
   6924 
   6925 	sfmmu_mlist_reloc_exit(low, high);
   6926 
   6927 	/*
   6928 	 * Postsuspend callbacks should drop any locks held across
   6929 	 * the suspend callbacks.  As before, we don't hold the mapping
   6930 	 * list lock at this point.. our assumption is that the mapping
   6931 	 * list still can't change due to our holding SE_EXCL lock and
   6932 	 * there being no unlocked mappings left. Hence the restriction
   6933 	 * on calling context to hat_delete_callback()
   6934 	 */
   6935 	hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND);
   6936 	if (ret != 0) {
   6937 		/*
   6938 		 * The second presuspend call failed: we got here through
   6939 		 * the suspend_fail label above.
   6940 		 */
   6941 		ASSERT(ret != EIO);
   6942 		PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus);
   6943 		kreloc_thread = NULL;
   6944 		mutex_exit(&kpr_mutex);
   6945 		return (EAGAIN);
   6946 	}
   6947 
   6948 	/*
   6949 	 * Now that we're out of the performance critical section we can
   6950 	 * take care of updating the hash table, since we still
   6951 	 * hold all the pages locked SE_EXCL at this point we
   6952 	 * needn't worry about things changing out from under us.
   6953 	 */
   6954 	tpp = targ;
   6955 	rpp = repl;
   6956 	for (i = 0; i < npages; i++, tpp++, rpp++) {
   6957 
   6958 		/*
   6959 		 * replace targ with replacement in page_hash table
   6960 		 */
   6961 		targ = tpp;
   6962 		page_relocate_hash(rpp, targ);
   6963 
   6964 		/*
   6965 		 * concatenate target; caller of platform_page_relocate()
   6966 		 * expects target to be concatenated after returning.
   6967 		 */
   6968 		ASSERT(targ->p_next == targ);
   6969 		ASSERT(targ->p_prev == targ);
   6970 		page_list_concat(&pl, &targ);
   6971 	}
   6972 
   6973 	ASSERT(*target == pl);
   6974 	*nrelocp = npages;
   6975 	PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus);
   6976 	kreloc_thread = NULL;
   6977 	mutex_exit(&kpr_mutex);
   6978 	return (0);
   6979 }
   6980 
   6981 /*
   6982  * Called when stray pa_hments are found attached to a page which is
   6983  * being freed.  Notify the subsystem which attached the pa_hment of
   6984  * the error if it registered a suitable handler, else panic.
   6985  */
   6986 static void
   6987 sfmmu_pahment_leaked(struct pa_hment *pahmep)
   6988 {
   6989 	id_t cb_id = pahmep->cb_id;
   6990 
   6991 	ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
   6992 	if (sfmmu_cb_table[cb_id].errhandler != NULL) {
   6993 		if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
   6994 		    HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
   6995 			return;		/* non-fatal */
   6996 	}
   6997 	panic("pa_hment leaked: 0x%p", (void *)pahmep);
   6998 }
   6999 
/*
 * Remove all mappings to page 'pp'.
 *
 * pp must be held SE_EXCL.  Mappings of every size recorded in the
 * page's map index are unloaded, followed by a single xt_sync() on the
 * union of the CPU sets returned by sfmmu_pageunload().
 *
 * forceflag:
 *   SFMMU_KERNEL_RELOC - locked kernel (ksfmmup) mappings are left in
 *   place, and pa_hments are left attached; the caller is expected to
 *   suspend/handle them.
 *   any other value - leftover pa_hments are reported through
 *   sfmmu_pahment_leaked(), unlinked and freed.
 *
 * XHAT-owned mappings are handed to XHAT_PAGEUNLOAD(); if any were seen
 * the whole operation is retried from the start.
 *
 * Always returns 0.
 */
int
hat_pageunload(struct page *pp, uint_t forceflag)
{
	struct page *origpp = pp;
	struct sf_hment *sfhme, *tmphme;
	struct hme_blk *hmeblkp;
	kmutex_t *pml;
#ifdef VAC
	kmutex_t *pmtx;
#endif
	cpuset_t cpuset, tset;
	int index, cons;
	int xhme_blks;
	int pa_hments;

	ASSERT(PAGE_EXCL(pp));

retry_xhat:
	tmphme = NULL;
	xhme_blks = 0;
	pa_hments = 0;
	CPUSET_ZERO(cpuset);

	pml = sfmmu_mlist_enter(pp);

#ifdef VAC
	if (pp->p_kpmref)
		sfmmu_kpm_pageunload(pp);
	ASSERT(!PP_ISMAPPED_KPM(pp));
#endif
	/*
	 * Clear vpm reference. Since the page is exclusively locked
	 * vpm cannot be referencing it.
	 */
	if (vpm_enable) {
		pp->p_vpmref = 0;
	}

	/*
	 * Start with 8K mappings; the loop after the scan walks the map
	 * index to find the larger mapping sizes present on this page.
	 */
	index = PP_MAPINDEX(pp);
	cons = TTE8K;
retry:
	/*
	 * The next pointer is fetched before the entry is processed
	 * because sfmmu_pageunload() unlinks the hment from the list.
	 */
	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
		tmphme = sfhme->hme_next;

		if (IS_PAHME(sfhme)) {
			ASSERT(sfhme->hme_data != NULL);
			pa_hments++;
			continue;
		}

		hmeblkp = sfmmu_hmetohblk(sfhme);
		if (hmeblkp->hblk_xhat_bit) {
			struct xhat_hme_blk *xblk =
			    (struct xhat_hme_blk *)hmeblkp;

			(void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat,
			    pp, forceflag, XBLK2PROVBLK(xblk));

			xhme_blks = 1;
			continue;
		}

		/*
		 * If there are kernel mappings don't unload them, they will
		 * be suspended.
		 */
		if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt &&
		    hmeblkp->hblk_tag.htag_id == ksfmmup)
			continue;

		tset = sfmmu_pageunload(pp, sfhme, cons);
		CPUSET_OR(cpuset, tset);
	}

	/*
	 * Advance to the next mapping size whose bit is set in the map
	 * index and repeat the scan on that size's group leader page.
	 */
	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			/* Go to leading page */
			pp = PP_GROUPLEADER(pp, cons);
			ASSERT(sfmmu_mlist_held(pp));
			goto retry;
		}
	}

	/*
	 * cpuset may be empty if the page was only mapped by segkpm,
	 * in which case we won't actually cross-trap.
	 */
	xt_sync(cpuset);

	/*
	 * The page should have no mappings at this point, unless
	 * we were called from hat_page_relocate() in which case we
	 * leave the locked mappings which will be suspended later.
	 */
	ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments ||
	    (forceflag == SFMMU_KERNEL_RELOC));

#ifdef VAC
	/*
	 * Note that pp may by now be the group leader of the largest
	 * mapping size found above rather than the page passed in.
	 */
	if (PP_ISTNC(pp)) {
		if (cons == TTE8K) {
			pmtx = sfmmu_page_enter(pp);
			PP_CLRTNC(pp);
			sfmmu_page_exit(pmtx);
		} else {
			conv_tnc(pp, cons);
		}
	}
#endif	/* VAC */

	if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) {
		/*
		 * Unlink any pa_hments and free them, calling back
		 * the responsible subsystem to notify it of the error.
		 * This can occur in situations such as drivers leaking
		 * DMA handles: naughty, but common enough that we'd like
		 * to keep the system running rather than bringing it
		 * down with an obscure error like "pa_hment leaked"
		 * which doesn't aid the user in debugging their driver.
		 */
		for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
			tmphme = sfhme->hme_next;
			if (IS_PAHME(sfhme)) {
				struct pa_hment *pahmep = sfhme->hme_data;
				sfmmu_pahment_leaked(pahmep);
				HME_SUB(sfhme, pp);
				kmem_cache_free(pa_hment_cache, pahmep);
			}
		}

		ASSERT(!PP_ISMAPPED(origpp) || xhme_blks);
	}

	sfmmu_mlist_exit(pml);

	/*
	 * XHAT may not have finished unloading pages
	 * because some other thread was waiting for
	 * mlist lock and XHAT_PAGEUNLOAD let it do
	 * the job.
	 */
	if (xhme_blks) {
		/* Restart from the page originally passed in. */
		pp = origpp;
		goto retry_xhat;
	}

	return (0);
}
   7153 
/*
 * Unload the single mapping described by 'sfhme' from page 'pp', provided
 * the mapping's tte size equals 'cons'.
 *
 * The caller must hold pp's mapping list lock.  The tte is invalidated,
 * its ref/mod state is synced to the page, the translation is demapped,
 * the hment is unlinked from the page and the hmeblk counts are dropped.
 *
 * Returns the set of CPUs the caller needs to xt_sync() with; the set is
 * empty when the mapping is of a different size and was left untouched.
 * Panics if the tte is found to be invalid.
 */
cpuset_t
sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons)
{
	struct hme_blk *hmeblkp;
	sfmmu_t *sfmmup;
	tte_t tte, ttemod;
#ifdef DEBUG
	tte_t orig_old;
#endif /* DEBUG */
	caddr_t addr;
	int ttesz;
	int ret;
	cpuset_t cpuset;

	ASSERT(pp != NULL);
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(!PP_ISKAS(pp));

	CPUSET_ZERO(cpuset);

	hmeblkp = sfmmu_hmetohblk(sfhme);

readtte:
	sfmmu_copytte(&sfhme->hme_tte, &tte);
	if (TTE_IS_VALID(&tte)) {
		sfmmup = hblktosfmmu(hmeblkp);
		ttesz = get_hblk_ttesz(hmeblkp);
		/*
		 * Only unload mappings of 'cons' size.
		 */
		if (ttesz != cons)
			return (cpuset);

		/*
		 * Note that we have p_mapping lock, but no hash lock here.
		 * hblk_unload() has to have both hash lock AND p_mapping
		 * lock before it tries to modify tte. So, the tte could
		 * not become invalid in the sfmmu_modifytte_try() below.
		 */
		ttemod = tte;
#ifdef DEBUG
		orig_old = tte;
#endif /* DEBUG */

		TTE_SET_INVALID(&ttemod);
		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
		if (ret < 0) {
#ifdef DEBUG
			/* only R/M bits can change. */
			chk_tte(&orig_old, &tte, &ttemod, hmeblkp);
#endif /* DEBUG */
			/* Lost a race on the tte; re-read it and try again. */
			goto readtte;
		}

		if (ret == 0) {
			panic("pageunload: cas failed?");
		}

		addr = tte_to_vaddr(hmeblkp, tte);

		if (hmeblkp->hblk_shared) {
			/*
			 * For a shared hmeblk the value from hblktosfmmu()
			 * is used as the shared region domain (srd) and
			 * the region is looked up by the tag's region id.
			 */
			sf_srd_t *srdp = (sf_srd_t *)sfmmup;
			uint_t rid = hmeblkp->hblk_tag.htag_rid;
			sf_region_t *rgnp;
			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
			ASSERT(srdp != NULL);
			rgnp = srdp->srd_hmergnp[rid];
			SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
			cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1);
			/* NULL hat: sync the page only, no per-hat stats. */
			sfmmu_ttesync(NULL, addr, &tte, pp);
			ASSERT(rgnp->rgn_ttecnt[ttesz] > 0);
			atomic_add_long(&rgnp->rgn_ttecnt[ttesz], -1);
		} else {
			sfmmu_ttesync(sfmmup, addr, &tte, pp);
			atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1);

			/*
			 * We need to flush the page from the virtual cache
			 * in order to prevent a virtual cache alias
			 * inconsistency. The particular scenario we need
			 * to worry about is:
			 * Given:  va1 and va2 are two virtual address that
			 * alias and will map the same physical address.
			 * 1.   mapping exists from va1 to pa and data has
			 *	been read into the cache.
			 * 2.   unload va1.
			 * 3.   load va2 and modify data using va2.
			 * 4    unload va2.
			 * 5.   load va1 and reference data.  Unless we flush
			 *	the data cache when we unload we will get
			 *	stale data.
			 * This scenario is taken care of by using virtual
			 * page coloring.
			 */
			if (sfmmup->sfmmu_ismhat) {
				/*
				 * Flush TSBs, TLBs and caches
				 * of every process
				 * sharing this ism segment.
				 */
				sfmmu_hat_lock_all();
				mutex_enter(&ism_mlist_lock);
				kpreempt_disable();
				sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp,
				    pp->p_pagenum, CACHE_NO_FLUSH);
				kpreempt_enable();
				mutex_exit(&ism_mlist_lock);
				sfmmu_hat_unlock_all();
				cpuset = cpu_ready_set;
			} else {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
				cpuset = sfmmup->sfmmu_cpusran;
			}
		}

		/*
		 * Hme_sub has to run after ttesync() and a_rss update.
		 * See hblk_unload().
		 */
		HME_SUB(sfhme, pp);
		membar_stst();

		/*
		 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
		 * since pteload may have done a HME_ADD() right after
		 * we did the HME_SUB() above. Hmecnt is now maintained
		 * by cas only. No lock guarantees its value. The only
		 * guarantee we have is the hmecnt should not be less than
		 * what it should be so the hblk will not be taken away.
		 * It's also important that we decremented the hmecnt after
		 * we are done with hmeblkp so that this hmeblk won't be
		 * stolen.
		 */
		ASSERT(hmeblkp->hblk_hmecnt > 0);
		ASSERT(hmeblkp->hblk_vcnt > 0);
		atomic_add_16(&hmeblkp->hblk_vcnt, -1);
		atomic_add_16(&hmeblkp->hblk_hmecnt, -1);
		/*
		 * This is bug 4063182.
		 * XXX: fixme
		 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
		 *	!hmeblkp->hblk_lckcnt);
		 */
	} else {
		panic("invalid tte? pp %p &tte %p",
		    (void *)pp, (void *)&tte);
	}

	return (cpuset);
}
   7305 
/*
 * While relocating a kernel page, this function will move the mappings
 * from tpp to dpp and modify any associated data with these mappings.
 * It also unsuspends the suspended kernel mapping.
 *
 * tpp is the page whose mappings are being taken away and dpp is the page
 * that receives them.  The caller must be running at PIL_MAX and must hold
 * the mapping list locks of both pages (all three conditions are asserted
 * below).
 *
 * Before returning, T_DONTDTRACE is cleared from curthread->t_flag and
 * kpr_suspendlock is dropped.  NOTE(review): both are presumably set up by
 * whoever suspended the mappings; confirm against the caller.
 */
static void
hat_pagereload(struct page *tpp, struct page *dpp)
{
	struct sf_hment *sfhme;
	tte_t tte, ttemod;
	int index, cons;

	ASSERT(getpil() == PIL_MAX);
	ASSERT(sfmmu_mlist_held(tpp));
	ASSERT(sfmmu_mlist_held(dpp));

	/*
	 * index is consumed one bit at a time by the loop at the bottom of
	 * this function; cons is the TTE size code that goes with the bit
	 * currently being examined and starts out at TTE8K.
	 */
	index = PP_MAPINDEX(tpp);
	cons = TTE8K;

	/* Update real mappings to the page */
retry:
	for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) {
		/* PA hments carry no translation to rewrite; skip them. */
		if (IS_PAHME(sfhme))
			continue;
		sfmmu_copytte(&sfhme->hme_tte, &tte);
		ttemod = tte;

		/*
		 * replace old pfn with new pfn in TTE
		 */
		PFN_TO_TTE(ttemod, dpp->p_pagenum);

		/*
		 * clear suspend bit
		 */
		ASSERT(TTE_IS_SUSPEND(&ttemod));
		TTE_CLR_SUSPEND(&ttemod);

		/*
		 * Unlike the other callers in this file there is no retry
		 * here: a negative return from the update is treated as
		 * fatal.
		 */
		if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0)
			panic("hat_pagereload(): sfmmu_modifytte_try() failed");

		/*
		 * set hme_page point to new page
		 */
		sfhme->hme_page = dpp;
	}

	/*
	 * move p_mapping list from old page to new page
	 */
	dpp->p_mapping = tpp->p_mapping;
	tpp->p_mapping = NULL;
	dpp->p_share = tpp->p_share;
	tpp->p_share = 0;

	/*
	 * Walk the remaining bits of index.  For every bit that is set,
	 * switch both tpp and dpp to the group leader page for the matching
	 * size code and go back to repeat the TTE rewrite and the list move
	 * for that leader.  cons is only advanced while bits remain.
	 */
	while (index != 0) {
		index = index >> 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			tpp = PP_GROUPLEADER(tpp, cons);
			dpp = PP_GROUPLEADER(dpp, cons);
			goto retry;
		}
	}

	curthread->t_flag &= ~T_DONTDTRACE;
	mutex_exit(&kpr_suspendlock);
}
   7375 
/*
 * Walk the mappings of pp, syncing each one through sfmmu_pagesync(), and
 * return the page's generic attributes (PP_GENERIC_ATTR()).
 *
 * clearflag selects the behaviour:
 *  - If pp is read-only and HAT_SYNC_STOPON_MOD is requested, the current
 *    attributes are returned without looking at any mapping.
 *  - When HAT_SYNC_ZERORM is not set, HAT_SYNC_STOPON_REF and
 *    HAT_SYNC_STOPON_MOD return immediately if the page already has the
 *    corresponding attribute, and HAT_SYNC_STOPON_SHARED returns
 *    immediately (after setting P_REF) if p_share already exceeds
 *    po_share.  Otherwise HAT_SYNC_STOPON_SHARED makes the walk count
 *    sharers and stop once the count exceeds po_share.
 *  - sfmmu_pagesync() is always called with the stop-on bits masked off.
 *
 * The walk covers the 8K mappings of pp first and then the mappings of
 * each group leader page selected by the bits of PP_MAPINDEX(pp).
 */
uint_t
hat_pagesync(struct page *pp, uint_t clearflag)
{
	struct sf_hment *sfhme, *tmphme = NULL;
	struct hme_blk *hmeblkp;
	kmutex_t *pml;
	cpuset_t cpuset, tset;
	int	index, cons;
	extern	ulong_t po_share;
	page_t	*save_pp = pp;		/* pp is replaced by group leaders */
	int	stop_on_sh = 0;
	uint_t	shcnt;			/* only valid when stop_on_sh is set */

	CPUSET_ZERO(cpuset);

	if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) {
		return (PP_GENERIC_ATTR(pp));
	}

	if ((clearflag & HAT_SYNC_ZERORM) == 0) {
		if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) {
			return (PP_GENERIC_ATTR(pp));
		}
		if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) {
			return (PP_GENERIC_ATTR(pp));
		}
		if (clearflag & HAT_SYNC_STOPON_SHARED) {
			if (pp->p_share > po_share) {
				hat_page_setattr(pp, P_REF);
				return (PP_GENERIC_ATTR(pp));
			}
			stop_on_sh = 1;
			shcnt = 0;
		}
	}

	/*
	 * The shared-stop request is now carried by stop_on_sh, so drop the
	 * bit from clearflag before it is compared and passed on below.
	 */
	clearflag &= ~HAT_SYNC_STOPON_SHARED;
	pml = sfmmu_mlist_enter(pp);
	index = PP_MAPINDEX(pp);
	cons = TTE8K;
retry:
	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
		/*
		 * We need to save the next hment on the list since
		 * it is possible for pagesync to remove an invalid hment
		 * from the list.
		 */
		tmphme = sfhme->hme_next;
		if (IS_PAHME(sfhme))
			continue;
		/*
		 * If we are looking for large mappings and this hme doesn't
		 * reach the range we are seeking, just ignore it.
		 */
		hmeblkp = sfmmu_hmetohblk(sfhme);
		/* Mappings flagged with hblk_xhat_bit are not synced here. */
		if (hmeblkp->hblk_xhat_bit)
			continue;

		if (hme_size(sfhme) < cons)
			continue;

		if (stop_on_sh) {
			/*
			 * A shared hmeblk counts as many sharers as its
			 * region's reference count; any other mapping counts
			 * as one.
			 */
			if (hmeblkp->hblk_shared) {
				sf_srd_t *srdp = hblktosrd(hmeblkp);
				uint_t rid = hmeblkp->hblk_tag.htag_rid;
				sf_region_t *rgnp;
				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
				ASSERT(srdp != NULL);
				rgnp = srdp->srd_hmergnp[rid];
				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
				    rgnp, rid);
				shcnt += rgnp->rgn_refcnt;
			} else {
				shcnt++;
			}
			if (shcnt > po_share) {
				/*
				 * tell the pager to spare the page this time
				 * around.
				 */
				hat_page_setattr(save_pp, P_REF);
				/* index = 0 also ends the large page walk */
				index = 0;
				break;
			}
		}
		tset = sfmmu_pagesync(pp, sfhme,
		    clearflag & ~HAT_SYNC_STOPON_RM);
		CPUSET_OR(cpuset, tset);

		/*
		 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
		 * as the "ref" or "mod" is set or share cnt exceeds po_share.
		 */
		if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO &&
		    (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) ||
		    ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) {
			index = 0;
			break;
		}
	}

	/*
	 * Step through the remaining bits of index; each set bit sends the
	 * walk back over the mapping list of the group leader page for the
	 * matching size code.
	 */
	while (index) {
		index = index >> 1;
		cons++;
		if (index & 0x1) {
			/* Go to leading page */
			pp = PP_GROUPLEADER(pp, cons);
			goto retry;
		}
	}

	/*
	 * cpuset is the union of the sets returned by sfmmu_pagesync();
	 * xt_sync() is called on it before the mlist lock is released.
	 */
	xt_sync(cpuset);
	sfmmu_mlist_exit(pml);
	return (PP_GENERIC_ATTR(save_pp));
}
   7492 
/*
 * Get all the hardware dependent attributes for a page struct
 *
 * Syncs the single mapping sfhme of page pp.  The caller must hold the
 * page's mapping list lock, and clearflag must be exactly
 * HAT_SYNC_DONTZERO or HAT_SYNC_ZERORM (both asserted).
 *
 * With HAT_SYNC_ZERORM the ref/mod bits are cleared in the TTE and, if this
 * thread's update is the one that took effect, the stale translation is
 * demapped.  In either mode a valid TTE is then passed to sfmmu_ttesync()
 * using the value read before any clearing.
 *
 * Returns the set of CPUs associated with the demap (empty if no demap was
 * done); hat_pagesync() accumulates these and hands them to xt_sync().
 */
static cpuset_t
sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme,
	uint_t clearflag)
{
	caddr_t addr;
	tte_t tte, ttemod;
	struct hme_blk *hmeblkp;
	int ret;
	sfmmu_t *sfmmup;
	cpuset_t cpuset;

	ASSERT(pp != NULL);
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
	    (clearflag == HAT_SYNC_ZERORM));

	SFMMU_STAT(sf_pagesync);

	CPUSET_ZERO(cpuset);

sfmmu_pagesync_retry:

	/* Work from a private snapshot of the TTE; invalid TTEs are ignored. */
	sfmmu_copytte(&sfhme->hme_tte, &tte);
	if (TTE_IS_VALID(&tte)) {
		hmeblkp = sfmmu_hmetohblk(sfhme);
		sfmmup = hblktosfmmu(hmeblkp);
		addr = tte_to_vaddr(hmeblkp, tte);
		if (clearflag == HAT_SYNC_ZERORM) {
			ttemod = tte;
			TTE_CLR_RM(&ttemod);
			ret = sfmmu_modifytte_try(&tte, &ttemod,
			    &sfhme->hme_tte);
			if (ret < 0) {
				/*
				 * cas failed and the new value is not what
				 * we want.
				 */
				goto sfmmu_pagesync_retry;
			}

			/*
			 * ret == 0 skips the demap.  NOTE(review): presumably
			 * this means the TTE needed no change; confirm
			 * against sfmmu_modifytte_try().
			 */
			if (ret > 0) {
				/* we win the cas */
				if (hmeblkp->hblk_shared) {
					/*
					 * For a shared hmeblk the owner
					 * pointer is used as an SRD and the
					 * demap goes through the region.
					 */
					sf_srd_t *srdp = (sf_srd_t *)sfmmup;
					uint_t rid =
					    hmeblkp->hblk_tag.htag_rid;
					sf_region_t *rgnp;
					ASSERT(SFMMU_IS_SHMERID_VALID(rid));
					ASSERT(rid < SFMMU_MAX_HME_REGIONS);
					ASSERT(srdp != NULL);
					rgnp = srdp->srd_hmergnp[rid];
					SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
					    srdp, rgnp, rid);
					cpuset = sfmmu_rgntlb_demap(addr,
					    rgnp, hmeblkp, 1);
				} else {
					sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
					    0, 0);
					cpuset = sfmmup->sfmmu_cpusran;
				}
			}
		}
		/* Shared hmeblks pass NULL instead of a hat pointer. */
		sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr,
		    &tte, pp);
	}
	return (cpuset);
}
   7563 
/*
 * Remove write permission from a mapping to a page, so that
 * we can detect the next modification of it. This requires modifying
 * the TTE then invalidating (demap) any TLB entry using that TTE.
 * This code is similar to sfmmu_pagesync().
 *
 * Operates on the single mapping sfhme of page pp; the caller must hold
 * the page's mapping list lock (asserted).  Mappings that are invalid or
 * already read-only are left untouched.
 *
 * Returns the set of CPUs associated with the demap (empty if no demap was
 * done); hat_page_clrwrt() accumulates these and hands them to xt_sync().
 */
static cpuset_t
sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme)
{
	caddr_t addr;
	tte_t tte;
	tte_t ttemod;
	struct hme_blk *hmeblkp;
	int ret;
	sfmmu_t *sfmmup;
	cpuset_t cpuset;

	ASSERT(pp != NULL);
	ASSERT(sfmmu_mlist_held(pp));

	CPUSET_ZERO(cpuset);
	SFMMU_STAT(sf_clrwrt);

retry:

	/* Work from a private snapshot of the TTE. */
	sfmmu_copytte(&sfhme->hme_tte, &tte);
	if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) {
		hmeblkp = sfmmu_hmetohblk(sfhme);

		/*
		 * xhat mappings should never be to a VMODSORT page.
		 */
		ASSERT(hmeblkp->hblk_xhat_bit == 0);

		sfmmup = hblktosfmmu(hmeblkp);
		addr = tte_to_vaddr(hmeblkp, tte);

		/* Clear both the write permission and the mod bit. */
		ttemod = tte;
		TTE_CLR_WRT(&ttemod);
		TTE_CLR_MOD(&ttemod);
		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);

		/*
		 * if cas failed and the new value is not what
		 * we want retry
		 */
		if (ret < 0)
			goto retry;

		/* we win the cas */
		if (ret > 0) {
			if (hmeblkp->hblk_shared) {
				/*
				 * For a shared hmeblk the owner pointer is
				 * used as an SRD and the demap goes through
				 * the region.
				 */
				sf_srd_t *srdp = (sf_srd_t *)sfmmup;
				uint_t rid = hmeblkp->hblk_tag.htag_rid;
				sf_region_t *rgnp;
				ASSERT(SFMMU_IS_SHMERID_VALID(rid));
				ASSERT(rid < SFMMU_MAX_HME_REGIONS);
				ASSERT(srdp != NULL);
				rgnp = srdp->srd_hmergnp[rid];
				SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
				    srdp, rgnp, rid);
				cpuset = sfmmu_rgntlb_demap(addr,
				    rgnp, hmeblkp, 1);
			} else {
				sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
				cpuset = sfmmup->sfmmu_cpusran;
			}
		}
	}

	return (cpuset);
}
   7636 
   7637 /*
   7638  * Walk all mappings of a page, removing write permission and clearing the
   7639  * ref/mod bits. This code is similar to hat_pagesync()
   7640  */
   7641 static void
   7642 hat_page_clrwrt(page_t *pp)
   7643 {
   7644 	struct sf_hment *sfhme;
   7645 	struct sf_hment *tmphme = NULL;
   7646 	kmutex_t *pml;
   7647 	cpuset_t cpuset;
   7648 	cpuset_t tset;
   7649 	int	index;
   7650 	int	 cons;
   7651 
   7652 	CPUSET_ZERO(cpuset);
   7653 
   7654 	pml = sfmmu_mlist_enter(pp);
   7655 	index = PP_MAPINDEX(pp);
   7656 	cons = TTE8K;
   7657 retry:
   7658 	for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
   7659 		tmphme = sfhme->hme_next;
   7660 
   7661 		/*
   7662 		 * If we are looking for large mappings and this hme doesn't
   7663 		 * reach the range we are seeking, just ignore its.
   7664 		 */
   7665 
   7666 		if (hme_size(sfhme) < cons)
   7667 			continue;
   7668 
   7669 		tset = sfmmu_pageclrwrt(pp, sfhme);
   7670 		CPUSET_OR(cpuset, tset);
   7671 	}
   7672 
   7673 	while (index) {
   7674 		index = index >> 1;
   7675 		cons++;
   7676 		if (index & 0x1) {
   7677 			/* Go to leading page */
   7678 			pp = PP_GROUPLEADER(pp, cons);
   7679 			goto retry;
   7680 		}
   7681 	}
   7682 
   7683 	xt_sync(cpuset);
   7684 	sfmmu_mlist_exit(pml);
   7685 }
   7686 
   7687 /*
   7688  * Set the given REF/MOD/RO bits for the given page.
   7689  * For a vnode with a sorted v_pages list, we need to change
   7690  * the attributes and the v_pages list together under page_vnode_mutex.
   7691  */
   7692 void
   7693 hat_page_setattr(page_t *pp, uint_t flag)
   7694 {
   7695 	vnode_t		*vp = pp->p_vnode;
   7696 	page_t		**listp;
   7697 	kmutex_t	*pmtx;
   7698 	kmutex_t	*vphm = NULL;
   7699 	int		noshuffle;
   7700 
   7701 	noshuffle = flag & P_NSH;
   7702 	flag &= ~P_NSH;
   7703 
   7704 	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
   7705 
   7706 	/*
   7707 	 * nothing to do if attribute already set
   7708 	 */
   7709 	if ((pp->p_nrm & flag) == flag)
   7710 		return;
   7711 
   7712 	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
   7713 	    !noshuffle) {
   7714 		vphm = page_vnode_mutex(vp);
   7715 		mutex_enter(vphm);
   7716 	}
   7717 
   7718 	pmtx = sfmmu_page_enter(pp);
   7719 	pp->p_nrm |= flag;
   7720 	sfmmu_page_exit(pmtx);
   7721 
   7722 	if (vphm != NULL) {
   7723 		/*
   7724 		 * Some File Systems examine v_pages for NULL w/o
   7725 		 * grabbing the vphm mutex. Must not let it become NULL when
   7726 		 * pp is the only page on the list.
   7727 		 */
   7728 		if (pp->p_vpnext != pp) {
   7729 			page_vpsub(&vp->v_pages, pp);
   7730 			if (vp->v_pages != NULL)
   7731 				listp = &vp->v_pages->p_vpprev->p_vpnext;
   7732 			else
   7733 				listp = &vp->v_pages;
   7734 			page_vpadd(listp, pp);
   7735 		}
   7736 		mutex_exit(vphm);
   7737 	}
   7738 }
   7739 
   7740 void
   7741 hat_page_clrattr(page_t *pp, uint_t flag)
   7742 {
   7743 	vnode_t		*vp = pp->p_vnode;
   7744 	kmutex_t	*pmtx;
   7745 
   7746 	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
   7747 
   7748 	pmtx = sfmmu_page_enter(pp);
   7749 
   7750 	/*
   7751 	 * Caller is expected to hold page's io lock for VMODSORT to work
   7752 	 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
   7753 	 * bit is cleared.
   7754 	 * We don't have assert to avoid tripping some existing third party
   7755 	 * code. The dirty page is moved back to top of the v_page list
   7756 	 * after IO is done in pvn_write_done().
   7757 	 */
   7758 	pp->p_nrm &= ~flag;
   7759 	sfmmu_page_exit(pmtx);
   7760 
   7761 	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
   7762 
   7763 		/*
   7764 		 * VMODSORT works by removing write permissions and getting
   7765 		 * a fault when a page is made dirty. At this point
   7766 		 * we need to remove write permission from all mappings
   7767 		 * to this page.
   7768 		 */
   7769 		hat_page_clrwrt(pp);
   7770 	}
   7771 }
   7772 
   7773 uint_t
   7774 hat_page_getattr(page_t *pp, uint_t flag)
   7775 {
   7776 	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
   7777 	return ((uint_t)(pp->p_nrm & flag));
   7778 }
   7779 
   7780 /*
   7781  * DEBUG kernels: verify that a kernel va<->pa translation
   7782  * is safe by checking the underlying page_t is in a page
   7783  * relocation-safe state.
   7784  */
   7785 #ifdef	DEBUG
   7786 void
   7787 sfmmu_check_kpfn(pfn_t pfn)
   7788 {
   7789 	page_t *pp;
   7790 	int index, cons;
   7791 
   7792 	if (hat_check_vtop == 0)
   7793 		return;
   7794 
   7795 	if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr)
   7796 		return;
   7797 
   7798 	pp = page_numtopp_nolock(pfn);
   7799 	if (!pp)
   7800 		return;
   7801 
   7802 	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
   7803 		return;
   7804 
   7805 	/*
   7806 	 * Handed a large kernel page, we dig up the root page since we
   7807 	 * know the root page might have the lock also.
   7808 	 */
   7809 	if (pp->p_szc != 0) {
   7810 		index = PP_MAPINDEX(pp);
   7811 		cons = TTE8K;
   7812 again:
   7813 		while (index != 0) {
   7814 			index >>= 1;
   7815 			if (index != 0)
   7816 				cons++;
   7817 			if (index & 0x1) {
   7818 				pp = PP_GROUPLEADER(pp, cons);
   7819 				goto again;
   7820 			}
   7821 		}
   7822 	}
   7823 
   7824 	if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
   7825 		return;
   7826 
   7827 	/*
   7828 	 * Pages need to be locked or allocated "permanent" (either from
   7829 	 * static_arena arena or explicitly setting PG_NORELOC when calling
   7830 	 * page_create_va()) for VA->PA translations to be valid.
   7831 	 */
   7832 	if (!PP_ISNORELOC(pp))
   7833 		panic("Illegal VA->PA translation, pp 0x%p not permanent",
   7834 		    (void *)pp);
   7835 	else
   7836 		panic("Illegal VA->PA translation, pp 0x%p not locked",
   7837 		    (void *)pp);
   7838 }
   7839 #endif	/* DEBUG */
   7840 
   7841 /*
   7842  * Returns a page frame number for a given virtual address.
   7843  * Returns PFN_INVALID to indicate an invalid mapping
   7844  */
   7845 pfn_t
   7846 hat_getpfnum(struct hat *hat, caddr_t addr)
   7847 {
   7848 	pfn_t pfn;
   7849 	tte_t tte;
   7850 
   7851 	/*
   7852 	 * We would like to
   7853 	 * ASSERT(AS_LOCK_HELD(as, &as->a_lock));
   7854 	 * but we can't because the iommu driver will call this
   7855 	 * routine at interrupt time and it can't grab the as lock
   7856 	 * or it will deadlock: A thread could have the as lock
   7857 	 * and be waiting for io.  The io can't complete
   7858 	 * because the interrupt thread is blocked trying to grab
   7859 	 * the as lock.
   7860 	 */
   7861 
   7862 	ASSERT(hat->sfmmu_xhat_provider == NULL);
   7863 
   7864 	if (hat == ksfmmup) {
   7865 		if (IS_KMEM_VA_LARGEPAGE(addr)) {
   7866 			ASSERT(segkmem_lpszc > 0);
   7867 			pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
   7868 			if (pfn != PFN_INVALID) {
   7869 				sfmmu_check_kpfn(pfn);
   7870 				return (pfn);
   7871 			}
   7872 		} else if (segkpm && IS_KPM_ADDR(addr)) {
   7873 			return (sfmmu_kpm_vatopfn(addr));
   7874 		}
   7875 		while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
   7876 		    == PFN_SUSPENDED) {
   7877 			sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
   7878 		}
   7879 		sfmmu_check_kpfn(pfn);
   7880 		return (pfn);
   7881 	} else {
   7882 		return (sfmmu_uvatopfn(addr, hat, NULL));
   7883 	}
   7884 }
   7885 
   7886 /*
   7887  * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged.
   7888  * Use hat_getpfnum(kas.a_hat, ...) instead.
   7889  *
   7890  * We'd like to return PFN_INVALID if the mappings have underlying page_t's
   7891  * but can't right now due to the fact that some software has grown to use
   7892  * this interface incorrectly. So for now when the interface is misused,
   7893  * return a warning to the user that in the future it won't work in the
   7894  * way they're abusing it, and carry on (after disabling page relocation).
   7895  */
   7896 pfn_t
   7897 hat_getkpfnum(caddr_t addr)
   7898 {
   7899 	pfn_t pfn;
   7900 	tte_t tte;
   7901 	int badcaller = 0;
   7902 	extern int segkmem_reloc;
   7903 
   7904 	if (segkpm && IS_KPM_ADDR(addr)) {
   7905 		badcaller = 1;
   7906 		pfn = sfmmu_kpm_vatopfn(addr);
   7907 	} else {
   7908 		while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
   7909 		    == PFN_SUSPENDED) {
   7910 			sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
   7911 		}
   7912 		badcaller = pf_is_memory(pfn);
   7913 	}
   7914 
   7915 	if (badcaller) {
   7916 		/*
   7917 		 * We can't return PFN_INVALID or the caller may panic
   7918 		 * or corrupt the system.  The only alternative is to
   7919 		 * disable page relocation at this point for all kernel
   7920 		 * memory.  This will impact any callers of page_relocate()
   7921 		 * such as FMA or DR.
   7922 		 *
   7923 		 * RFE: Add junk here to spit out an ereport so the sysadmin
   7924 		 * can be advised that he should upgrade his device driver
   7925 		 * so that this doesn't happen.
   7926 		 */
   7927 		hat_getkpfnum_badcall(caller());
   7928 		if (hat_kpr_enabled && segkmem_reloc) {
   7929 			hat_kpr_enabled = 0;
   7930 			segkmem_reloc = 0;
   7931 			cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED");
   7932 		}
   7933 	}
   7934 	return (pfn);
   7935 }
   7936 
   7937 /*
   7938  * This routine will return both pfn and tte for the vaddr.
   7939  */
   7940 static pfn_t
   7941 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep)
   7942 {
   7943 	struct hmehash_bucket *hmebp;
   7944 	hmeblk_tag hblktag;
   7945 	int hmeshift, hashno = 1;
   7946 	struct hme_blk *hmeblkp = NULL;
   7947 	tte_t tte;
   7948 
   7949 	struct sf_hment *sfhmep;
   7950 	pfn_t pfn;
   7951 
   7952 	/* support for ISM */
   7953 	ism_map_t	*ism_map;
   7954 	ism_blk_t	*ism_blkp;
   7955 	int		i;
   7956 	sfmmu_t *ism_hatid = NULL;
   7957 	sfmmu_t *locked_hatid = NULL;
   7958 	sfmmu_t	*sv_sfmmup = sfmmup;
   7959 	caddr_t	sv_vaddr = vaddr;
   7960 	sf_srd_t *srdp;
   7961 
   7962 	if (ttep == NULL) {
   7963 		ttep = &tte;
   7964 	} else {
   7965 		ttep->ll = 0;
   7966 	}
   7967 
   7968 	ASSERT(sfmmup != ksfmmup);
   7969 	SFMMU_STAT(sf_user_vtop);
   7970 	/*
   7971 	 * Set ism_hatid if vaddr falls in a ISM segment.
   7972 	 */
   7973 	ism_blkp = sfmmup->sfmmu_iblk;
   7974 	if (ism_blkp != NULL) {
   7975 		sfmmu_ismhat_enter(sfmmup, 0);
   7976 		locked_hatid = sfmmup;
   7977 	}
   7978 	while (ism_blkp != NULL && ism_hatid == NULL) {
   7979 		ism_map = ism_blkp->iblk_maps;
   7980 		for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
   7981 			if (vaddr >= ism_start(ism_map[i]) &&
   7982 			    vaddr < ism_end(ism_map[i])) {
   7983 				sfmmup = ism_hatid = ism_map[i].imap_ismhat;
   7984 				vaddr = (caddr_t)(vaddr -
   7985 				    ism_start(ism_map[i]));
   7986 				break;
   7987 			}
   7988 		}
   7989 		ism_blkp = ism_blkp->iblk_next;
   7990 	}
   7991 	if (locked_hatid) {
   7992 		sfmmu_ismhat_exit(locked_hatid, 0);
   7993 	}
   7994 
   7995 	hblktag.htag_id = sfmmup;
   7996 	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
   7997 	do {
   7998 		hmeshift = HME_HASH_SHIFT(hashno);
   7999 		hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
   8000 		hblktag.htag_rehash = hashno;
   8001 		hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
   8002 
   8003 		SFMMU_HASH_LOCK(hmebp);
   8004 
   8005 		HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
   8006 		if (hmeblkp != NULL) {
   8007 			ASSERT(!hmeblkp->hblk_shared);
   8008 			HBLKTOHME(sfhmep, hmeblkp, vaddr);
   8009 			sfmmu_copytte(&sfhmep->hme_tte, ttep);
   8010 			SFMMU_HASH_UNLOCK(hmebp);
   8011 			if (TTE_IS_VALID(ttep)) {
   8012 				pfn = TTE_TO_PFN(vaddr, ttep);
   8013 				return (pfn);
   8014 			}
   8015 			break;
   8016 		}
   8017 		SFMMU_HASH_UNLOCK(hmebp);
   8018 		hashno++;
   8019 	} while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt));
   8020 
   8021 	if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) {
   8022 		return (PFN_INVALID);
   8023 	}
   8024 	srdp = sv_sfmmup->sfmmu_srdp;
   8025 	ASSERT(srdp != NULL);
   8026 	ASSERT(srdp->srd_refcnt != 0);
   8027 	hblktag.htag_id = srdp;
   8028 	hashno = 1;
   8029 	do {
   8030 		hmeshift = HME_HASH_SHIFT(hashno);
   8031 		hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift);
   8032 		hblktag.htag_rehash = hashno;
   8033 		hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift);
   8034 
   8035 		SFMMU_HASH_LOCK(hmebp);
   8036 		for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL;
   8037 		    hmeblkp = hmeblkp->hblk_next) {
   8038 			uint_t rid;
   8039 			sf_region_t *rgnp;
   8040 			caddr_t rsaddr;
   8041 			caddr_t readdr;
   8042 
   8043 			if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag,
   8044 			    sv_sfmmup->sfmmu_hmeregion_map)) {
   8045 				continue;
   8046 			}
   8047 			ASSERT(hmeblkp->hblk_shared);
   8048 			rid = hmeblkp->hblk_tag.htag_rid;
   8049 			ASSERT(SFMMU_IS_SHMERID_VALID(rid));
   8050 			ASSERT(rid < SFMMU_MAX_HME_REGIONS);
   8051 			rgnp = srdp->srd_hmergnp[rid];
   8052 			SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
   8053 			HBLKTOHME(sfhmep, hmeblkp, sv_vaddr);
   8054 			sfmmu_copytte(&sfhmep->hme_tte, ttep);
   8055 			rsaddr = rgnp->rgn_saddr;
   8056 			readdr = rsaddr + rgnp->rgn_size;
   8057 #ifdef DEBUG
   8058 			if (TTE_IS_VALID(ttep) ||
   8059 			    get_hblk_ttesz(hmeblkp) > TTE8K) {
   8060 				caddr_t eva = tte_to_evaddr(hmeblkp, ttep);
   8061 				ASSERT(eva > sv_vaddr);
   8062 				ASSERT(sv_vaddr >= rsaddr);
   8063 				ASSERT(sv_vaddr < readdr);
   8064 				ASSERT(eva <= readdr);
   8065 			}
   8066 #endif /* DEBUG */
   8067 			/*
   8068 			 * Continue the search if we
   8069 			 * found an invalid 8K tte outside of the area
   8070 			 * covered by this hmeblk's region.
   8071 			 */
   8072 			if (TTE_IS_VALID(ttep)) {
   8073 				SFMMU_HASH_UNLOCK(hmebp);
   8074 				pfn = TTE_TO_PFN(sv_vaddr, ttep);
   8075 				return (pfn);
   8076 			} else if (get_hblk_ttesz(hmeblkp) > TTE8K ||
   8077 			    (sv_vaddr >= rsaddr && sv_vaddr < readdr)) {
   8078 				SFMMU_HASH_UNLOCK(hmebp);
   8079 				pfn = PFN_INVALID;
   8080 				return (pfn);
   8081 			}
   8082 		}
   8083 		SFMMU_HASH_UNLOCK(hmebp);
   8084 		hashno++;
   8085 	} while (hashno <= mmu_hashcnt);
   8086 	return (PFN_INVALID);
   8087 }
   8088 
   8089 
/*
 * For compatibility with AT&T and later optimizations
 *
 * This implementation does nothing beyond asserting (on DEBUG kernels)
 * that hat is non-NULL and has no xhat provider; addr, len and flags are
 * unused.
 */
/* ARGSUSED */
void
hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
{
	ASSERT(hat != NULL);
	ASSERT(hat->sfmmu_xhat_provider == NULL);
}
   8100 
   8101 /*
   8102  * Return the number of mappings to a particular page.  This number is an
   8103  * approximation of the number of people sharing the page.
   8104  *
   8105  * shared hmeblks or ism hmeblks are counted as 1 mapping here.
   8106  * hat_page_checkshare() can be used to compare threshold to share
   8107  * count that reflects the number of region sharers albeit at higher cost.
   8108  */
   8109 ulong_t
   8110 hat_page_getshare(page_t *pp)
   8111 {
   8112 	page_t *spp = pp;	/* start page */
   8113 	kmutex_t *pml;
   8114 	ulong_t	cnt;
   8115 	int index, sz = TTE64K;
   8116 
   8117 	/*
   8118 	 * We need to grab the mlist lock to make sure any outstanding
   8119 	 * load/unloads complete.  Otherwise we could return zero
   8120 	 * even though the unload(s) hasn't finished yet.
   8121 	 */
   8122 	pml = sfmmu_mlist_enter(spp);
   8123 	cnt = spp->p_share;
   8124 
   8125 #ifdef VAC
   8126 	if (kpm_enable)
   8127 		cnt += spp->p_kpmref;
   8128 #endif
   8129 	if (vpm_enable && pp->p_vpmref) {
   8130 		cnt += 1;
   8131 	}
   8132 
   8133