Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
     29  * ================================================================
     30  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
     31  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
     32  * one or more CPUs and some local memory.  The CPUs in each node can access
     33  * the memory in the other nodes but at a higher latency than accessing their
     34  * local memory.  Typically, a system with only one node has Uniform Memory
     35  * Access (UMA), but it may be possible to have a one node system that has
     36  * some global memory outside of the node which is higher latency.
     37  *
     38  * Module Description
     39  * ------------------
     40  * This module provides a platform interface for determining which CPUs and
     41  * which memory (and how much) are in a NUMA node and how far each node is from
     42  * each other.  The interface is used by the Virtual Memory (VM) system and the
     43  * common lgroup framework.  The VM system uses the plat_*() routines to fill
     44  * in its memory node (memnode) array with the physical address range spanned
     45  * by each NUMA node to know which memory belongs to which node, so it can
     46  * build and manage a physical page free list for each NUMA node and allocate
     47  * local memory from each node as needed.  The common lgroup framework uses the
     48  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
     49  * to each node (leaf lgroup) and how far each node is from each other, so it
     50  * can build the latency (lgroup) topology for the machine in order to optimize
     51  * for locality.  Also, lgroup platform handles instead of lgroups are used
     52  * in the interface with this module, so this module shouldn't need to know
     53  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
     54  * etc. are in each NUMA node, how far each node is from each other, and to use
     55  * a unique lgroup platform handle to refer to each node through the interface.
     56  *
     57  * Determining NUMA Configuration
     58  * ------------------------------
     59  * By default, this module will try to determine the NUMA configuration of the
     60  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
     61  * Locality Information Table (SLIT).  The SRAT contains info to tell which
     62  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
     63  * is a matrix that gives the distance between each system locality (which is
     64  * a NUMA node and should correspond to proximity domains in the SRAT).  For
     65  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
     66  * specification.
     67  *
     68  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
     69  * examine registers in PCI configuration space to determine how many nodes are
     70  * in the system and which CPUs and memory are in each node.
     71  * This is done while booting the kernel.
     72  *
     73  * NOTE: Using these PCI configuration space registers to determine this
     74  *       locality info is not guaranteed to work or be compatible across all
     75  *	 Opteron processor families.
     76  *
     77  * If the SLIT does not exist or look right, the kernel will probe to determine
     78  * the distance between nodes as long as the NUMA CPU and memory configuration
     79  * has been determined (see lgrp_plat_probe() for details).
     80  *
     81  * Data Structures
     82  * ---------------
     83  * The main data structures used by this code are the following:
     84  *
     85  * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
     86  *					CPU ID (only used for SRAT)
     87  *
     88  * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
     89  *					different nodes indexed by node ID
     90  *
     91  * - lgrp_plat_node_cnt			Number of NUMA nodes in system
     92  *
     93  * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
     94  *					table indexed by node ID (only used
     95  *					for SRAT)
     96  *
     97  * - lgrp_plat_node_memory[]		Table with physical address range for
     98  *					each node indexed by node ID
     99  *
    100  * The code is implemented to make the following always be true:
    101  *
    102  *	lgroup platform handle == node ID == memnode ID
    103  *
    104  * Moreover, it allows for the proximity domain ID to be equal to all of the
    105  * above as long as the proximity domains IDs are numbered from 0 to <number of
    106  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
    107  * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
    108  * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
    109  * and be assigned node ID N.  If the proximity domain IDs aren't numbered
    110  * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
    111  * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
    112  * to node IDs.  However, the proximity domain IDs may not map to the
    113  * equivalent node ID since we want to keep the node IDs numbered from 0 to
    114  * <number of nodes - 1> to minimize cost of searching and potentially space.
    115  *
    116  * The code below really tries to do the above.  However, the virtual memory
    117  * system expects the memnodes which describe the physical address range for
    118  * each NUMA node to be arranged in ascending order by physical address.  (:-(
    119  * Otherwise, the kernel will panic in different semi-random places in the VM
    120  * system.
    121  *
    122  * Consequently, this module has to try to sort the nodes in ascending order by
    123  * each node's starting physical address to try to meet this "constraint" in
    124  * the VM system (see lgrp_plat_node_sort()).  Also, the lowest numbered
    125  * proximity domain ID in the system is determined and used to make the lowest
    126  * numbered proximity domain map to node 0 in hopes that the proximity domains
    127  * are sorted in ascending order by physical address already even if their IDs
    128  * don't start at 0 (see NODE_DOMAIN_HASH() and lgrp_plat_srat_domains()).
    129  * Finally, it is important to note that these workarounds may not be
    130  * sufficient if/when memory hotplugging is supported and the VM system may
    131  * ultimately need to be fixed to handle this....
    132  */
    133 
    134 
    135 #include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
    136 #include <sys/bootconf.h>
    137 #include <sys/cmn_err.h>
    138 #include <sys/controlregs.h>
    139 #include <sys/cpupart.h>
    140 #include <sys/cpuvar.h>
    141 #include <sys/lgrp.h>
    142 #include <sys/machsystm.h>
    143 #include <sys/memlist.h>
    144 #include <sys/memnode.h>
    145 #include <sys/mman.h>
    146 #include <sys/pci_cfgspace.h>
    147 #include <sys/pci_impl.h>
    148 #include <sys/param.h>
    149 #include <sys/pghw.h>
    150 #include <sys/promif.h>		/* for prom_printf() */
    151 #include <sys/sysmacros.h>
    152 #include <sys/systm.h>
    153 #include <sys/thread.h>
    154 #include <sys/types.h>
    155 #include <sys/var.h>
    156 #include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
    157 #include <vm/hat_i86.h>
    158 #include <vm/seg_kmem.h>
    159 #include <vm/vm_dep.h>
    160 
    161 #include "acpi_fw.h"		/* for SRAT and SLIT */
    162 
    163 
    164 #define	MAX_NODES		8	/* size of per-node tables */
    165 #define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)	/* static lgroup array size */
    166 
    167 /*
    168  * Constants for configuring probing (defaults for the lgrp_plat_probe_*
    169  * tunables defined later in this file)
    170  */
    170 #define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
    171 #define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
    172 #define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */
    173 
    174 /*
    175  * Flags for probing (see lgrp_plat_probe_flags)
    176  */
    177 #define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
    178 #define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
    179 #define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */
    180 
    181 /*
    182  * Hash proximity domain ID into node to domain mapping table "mod" number of
    183  * nodes to minimize span of entries used and try to have lowest numbered
    184  * proximity domain be node 0
    185  */
    186 #define	NODE_DOMAIN_HASH(domain, node_cnt) \
    187 	((lgrp_plat_prox_domain_min == UINT32_MAX) ? (domain) % node_cnt : \
    188 	    ((domain) - lgrp_plat_prox_domain_min) % node_cnt)
    189 
    190 
    191 /*
    192  * CPU to node ID mapping structure (only used with SRAT); one entry per CPU
    193  */
    194 typedef	struct cpu_node_map {
    195 	int		exists;		/* presumably nonzero once filled in */
    196 	uint_t		node;		/* node ID that the CPU belongs to */
    197 	uint32_t	apicid;		/* APIC ID of the CPU */
    198 	uint32_t	prox_domain;	/* proximity domain ID of the CPU */
    199 } cpu_node_map_t;
    200 
    201 /*
    202  * Latency statistics
    203  */
    204 typedef struct lgrp_plat_latency_stats {
    205 	hrtime_t	latencies[MAX_NODES][MAX_NODES]; /* by [from][to] node */
    206 	hrtime_t	latency_max;	/* reported for the root lgroup */
    207 	hrtime_t	latency_min;	/* presumably smallest latency seen */
    208 } lgrp_plat_latency_stats_t;
    209 
    210 /*
    211  * Memory configuration for probing (arrays have one slot per possible node)
    212  */
    213 typedef struct lgrp_plat_probe_mem_config {
    214 	size_t	probe_memsize;		/* how much memory to probe per node */
    215 	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
    216 	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
    217 } lgrp_plat_probe_mem_config_t;
    218 
    219 /*
    220  * Statistics kept for probing.  The two dimensional members have one slot
    221  * per pair of nodes; NOTE(review): presumably indexed [from][to] - confirm.
    222  */
    222 typedef struct lgrp_plat_probe_stats {
    223 	hrtime_t	flush_cost;
    224 	hrtime_t	probe_cost;
    225 	hrtime_t	probe_cost_total;
    226 	hrtime_t	probe_error_code;
    227 	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
    228 	int		probe_suspect[MAX_NODES][MAX_NODES];
    229 	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
    230 	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
    231 } lgrp_plat_probe_stats_t;
    232 
    233 /*
    234  * Node to proximity domain ID mapping structure (only used with SRAT)
    235  */
    236 typedef	struct node_domain_map {
    237 	int		exists;		/* presumably nonzero once assigned */
    238 	uint32_t	prox_domain;	/* proximity domain ID for the node */
    239 } node_domain_map_t;
    240 
    241 /*
    242  * Node ID and starting and ending page for physical memory in node
    243  */
    244 typedef	struct node_phys_addr_map {
    245 	pfn_t		start;		/* first page frame number in node */
    246 	pfn_t		end;		/* last page frame number (inclusive) */
    247 	int		exists;		/* nonzero if node has memory */
    248 	uint32_t	prox_domain;	/* proximity domain ID of the memory */
    249 } node_phys_addr_map_t;
    250 
    251 /*
    252  * Number of CPUs for which we got APIC IDs
    253  */
    254 static int				lgrp_plat_apic_ncpus = 0;
    255 
    256 /*
    257  * CPU to node ID mapping table (only used for SRAT) and its max number of
    258  * entries
    259  */
    260 static cpu_node_map_t			*lgrp_plat_cpu_node = NULL;
    261 static uint_t				lgrp_plat_cpu_node_nentries = 0;
    262 
    263 /*
    264  * Latency statistics (read by lgrp_plat_latency())
    265  */
    266 lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;
    267 
    268 /*
    269  * Whether memory is interleaved across nodes causing MPO to be disabled
    270  */
    271 static int				lgrp_plat_mem_intrlv = 0;
    272 
    273 /*
    274  * Node ID to proximity domain ID mapping table (only used for SRAT)
    275  */
    276 static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];
    277 
    278 /*
    279  * Physical address range for memory in each node, indexed by node ID
    280  */
    281 static node_phys_addr_map_t		lgrp_plat_node_memory[MAX_NODES];
    282 
    283 /*
    284  * Statistics gotten from probing
    285  */
    286 static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;
    287 
    288 /*
    289  * Memory configuration for probing
    290  */
    291 static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;
    292 
    293 /*
    294  * Lowest proximity domain ID seen in ACPI SRAT (UINT32_MAX until one is set)
    295  */
    296 static uint32_t				lgrp_plat_prox_domain_min = UINT32_MAX;
    297 
    298 /*
    299  * Error code from processing ACPI SRAT
    300  */
    301 static int				lgrp_plat_srat_error = 0;
    302 
    303 /*
    304  * Error code from processing ACPI SLIT
    305  */
    306 static int				lgrp_plat_slit_error = 0;
    307 
    308 /*
    309  * Allocate lgroup array statically; nlgrps_alloc counts the entries of
    310  * lgrp_space[] that lgrp_plat_alloc() has handed out so far
    310  */
    310 static lgrp_t				lgrp_space[NLGRP];
    311 static int				nlgrps_alloc;
    313 
    314 
    315 /*
    316  * Enable finding and using minimum proximity domain ID when hashing
    317  */
    318 int			lgrp_plat_domain_min_enable = 1;
    319 
    320 /*
    321  * Number of nodes in system (1 means the machine is treated as UMA)
    322  */
    323 uint_t			lgrp_plat_node_cnt = 1;
    324 
    325 /*
    326  * Enable sorting nodes in ascending order by starting physical address
    327  */
    328 int			lgrp_plat_node_sort_enable = 1;
    329 
    330 /*
    331  * Configuration Parameters for Probing
    332  * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
    333  *				operation, etc.
    334  * - lgrp_plat_probe_nrounds	How many rounds of probing to do
    335  * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
    336  *				node
    337  * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
    338  *				Northbridge for each probe
    339  */
    340 uint_t			lgrp_plat_probe_flags = 0;
    341 int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
    342 int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
    343 int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;
    344 
    345 /*
    346  * Enable use of ACPI System Resource Affinity Table (SRAT) and System
    347  * Locality Information Table (SLIT); both can be overridden by boot
    348  * properties in lgrp_plat_init()
    348  */
    349 int			lgrp_plat_srat_enable = 1;
    350 int			lgrp_plat_slit_enable = 1;
    351 
    352 /*
    353  * mnode_xwa: set to non-zero value to initiate workaround if large pages are
    354  * found to be crossing memory node boundaries. The workaround will eliminate
    355  * a base size page at the end of each memory node boundary to ensure that
    356  * a large page with constituent pages that span more than 1 memory node
    357  * can never be formed.
    358  * plat_mnode_xcheck() increments it when a crossing is found; a page is only
    359  * sacrificed by plat_build_mem_nodes() once the value is greater than 1.
    359  */
    360 int	mnode_xwa = 1;
    361 
    362 /*
    363  * Static array to hold lgroup statistics
    364  */
    365 struct lgrp_stats	lgrp_stats[NLGRP];
    366 
    367 
    368 /*
    369  * Forward declarations of platform interface routines
    370  */
    371 void		plat_build_mem_nodes(struct memlist *list);
    372 
    373 int		plat_lgrphand_to_mem_node(lgrp_handle_t hand);
    374 
    375 lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);
    376 
    377 int		plat_mnode_xcheck(pfn_t pfncnt);
    378 
    379 int		plat_pfn_to_mem_node(pfn_t pfn);
    380 
    381 /*
    382  * Forward declarations of lgroup platform interface routines
    383  */
    384 lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);
    385 
    386 void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
    387 
    388 lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);
    389 
    390 void		lgrp_plat_init(lgrp_init_stages_t stage);
    391 
    392 int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
    393 
    394 int		lgrp_plat_max_lgrps(void);
    395 
    396 pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
    397     lgrp_mem_query_t query);
    398 
    399 lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);
    400 
    401 void		lgrp_plat_probe(void);
    402 
    403 lgrp_handle_t	lgrp_plat_root_hand(void);
    404 
    405 
    406 /*
    407  * Forward declarations of local routines
    408  */
    409 static int	is_opteron(void);
    410 
    411 static int	lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
    412     int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
    413     uint32_t domain);
    414 
    415 static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
    416     int cpu_node_nentries);
    417 
    418 static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
    419     int node_cnt, uint32_t domain);
    420 
    421 static void	lgrp_plat_get_numa_config(void);
    422 
    423 static void	lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
    424     lgrp_plat_latency_stats_t *lat_stats,
    425     lgrp_plat_probe_stats_t *probe_stats);
    426 
    427 static int	lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
    428     lgrp_plat_latency_stats_t *lat_stats);
    429 
    430 static void	lgrp_plat_main_init(void);
    431 
    432 static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
    433 
    434 static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
    435     int node_cnt, uint32_t domain);
    436 
    437 static int	lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
    438     int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start,
    439     uint64_t end, uint32_t domain);
    440 
    441 static void	lgrp_plat_node_sort(node_domain_map_t *node_domain,
    442     int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
    443     node_phys_addr_map_t *node_memory);
    444 
    445 static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
    446     int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
    447     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);
    448 
    449 static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);
    450 
    451 static int	lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
    452     node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats);
    453 
    454 static int	lgrp_plat_process_srat(struct srat *tp,
    455     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
    456     cpu_node_map_t *cpu_node, int cpu_count,
    457     node_phys_addr_map_t *node_memory);
    458 
    459 static void	lgrp_plat_release_bootstrap(void);
    460 
    461 static int	lgrp_plat_srat_domains(struct srat *tp,
    462     uint32_t *prox_domain_min);
    463 
    464 static void	lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
    465     lgrp_plat_latency_stats_t *lat_stats);
    466 
    467 static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    468     node_phys_addr_map_t *node_memory);
    469 
    470 static hrtime_t	opt_probe_vendor(int dest_node, int nreads);
    471 
    472 
    473 /*
    474  * PLATFORM INTERFACE ROUTINES
    475  */
    476 
    477 /*
    478  * Configure memory nodes for machines with more than one node (ie NUMA)
    479  */
    480 void
    481 plat_build_mem_nodes(struct memlist *list)
    482 {
    483 	pfn_t		cur_start;	/* start addr of subrange */
    484 	pfn_t		cur_end;	/* end addr of subrange */
    485 	pfn_t		start;		/* start addr of whole range */
    486 	pfn_t		end;		/* end addr of whole range */
    487 	pgcnt_t		endcnt;		/* pages to sacrifice */
    488 
    489 	/*
    490 	 * Boot install lists are arranged <addr, len>, ...
    491 	 */
    492 	while (list) {
    493 		int	node;
    494 
    495 		start = list->address >> PAGESHIFT;
    496 		end = (list->address + list->size - 1) >> PAGESHIFT;
    497 
    498 		if (start > physmax) {
    499 			list = list->next;
    500 			continue;
    501 		}
    502 		if (end > physmax)
    503 			end = physmax;
    504 
    505 		/*
    506 		 * When there is only one memnode, just add memory to memnode
    507 		 */
    508 		if (max_mem_nodes == 1) {
    509 			mem_node_add_slice(start, end);
    510 			list = list->next;
    511 			continue;
    512 		}
    513 
    514 		/*
    515 		 * mem_node_add_slice() expects to get a memory range that
    516 		 * is within one memnode, so need to split any memory range
    517 		 * that spans multiple memnodes into subranges that are each
    518 		 * contained within one memnode when feeding them to
    519 		 * mem_node_add_slice()
    520 		 */
    521 		cur_start = start;
    522 		do {
    523 			node = plat_pfn_to_mem_node(cur_start);
    524 
    525 			/*
    526 			 * Panic if DRAM address map registers or SRAT say
    527 			 * memory in node doesn't exist or address from
    528 			 * boot installed memory list entry isn't in this node.
    529 			 * This shouldn't happen and rest of code can't deal
    530 			 * with this if it does.
    531 			 */
    532 			if (node < 0 || node >= lgrp_plat_node_cnt ||
    533 			    !lgrp_plat_node_memory[node].exists ||
    534 			    cur_start < lgrp_plat_node_memory[node].start ||
    535 			    cur_start > lgrp_plat_node_memory[node].end) {
    536 				cmn_err(CE_PANIC, "Don't know which memnode "
    537 				    "to add installed memory address 0x%lx\n",
    538 				    cur_start);
    539 			}
    540 
    541 			/*
    542 			 * End of current subrange should not span memnodes
    543 			 */
    544 			cur_end = end;
    545 			endcnt = 0;
    546 			if (lgrp_plat_node_memory[node].exists &&
    547 			    cur_end > lgrp_plat_node_memory[node].end) {
    548 				cur_end = lgrp_plat_node_memory[node].end;
    549 				if (mnode_xwa > 1) {
    550 					/*
    551 					 * sacrifice the last page in each
    552 					 * node to eliminate large pages
    553 					 * that span more than 1 memory node.
    554 					 */
    555 					endcnt = 1;
    556 					physinstalled--;
    557 				}
    558 			}
    559 
    560 			mem_node_add_slice(cur_start, cur_end - endcnt);
    561 
    562 			/*
    563 			 * Next subrange starts after end of current one
    564 			 */
    565 			cur_start = cur_end + 1;
    566 		} while (cur_end < end);
    567 
    568 		list = list->next;
    569 	}
    570 	mem_node_physalign = 0;
    571 	mem_node_pfn_shift = 0;
    572 }
    573 
    574 
    575 int
    576 plat_lgrphand_to_mem_node(lgrp_handle_t hand)
    577 {
    578 	if (max_mem_nodes == 1)
    579 		return (0);
    580 
    581 	return ((int)hand);
    582 }
    583 
    584 
    585 /*
    586  * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
    587  * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
    588  * a crossing is found and returns 0 otherwise.
    589  */
    590 int
    591 plat_mnode_xcheck(pfn_t pfncnt)
    592 {
    593 	int	node, prevnode = -1, basenode;
    594 	pfn_t	ea, sa;
    595 
    596 	for (node = 0; node < lgrp_plat_node_cnt; node++) {
    597 
    598 		if (lgrp_plat_node_memory[node].exists == 0)
    599 			continue;
    600 
    601 		if (prevnode == -1) {
    602 			prevnode = node;
    603 			basenode = node;
    604 			continue;
    605 		}
    606 
    607 		/* assume x86 node pfn ranges are in increasing order */
    608 		ASSERT(lgrp_plat_node_memory[node].start >
    609 		    lgrp_plat_node_memory[prevnode].end);
    610 
    611 		/*
    612 		 * continue if the starting address of node is not contiguous
    613 		 * with the previous node.
    614 		 */
    615 
    616 		if (lgrp_plat_node_memory[node].start !=
    617 		    (lgrp_plat_node_memory[prevnode].end + 1)) {
    618 			basenode = node;
    619 			prevnode = node;
    620 			continue;
    621 		}
    622 
    623 		/* check if the starting address of node is pfncnt aligned */
    624 		if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) {
    625 
    626 			/*
    627 			 * at this point, node starts at an unaligned boundary
    628 			 * and is contiguous with the previous node(s) to
    629 			 * basenode. Check if there is an aligned contiguous
    630 			 * range of length pfncnt that crosses this boundary.
    631 			 */
    632 
    633 			sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end,
    634 			    pfncnt);
    635 			ea = P2ROUNDUP((lgrp_plat_node_memory[node].start),
    636 			    pfncnt);
    637 
    638 			ASSERT((ea - sa) == pfncnt);
    639 			if (sa >= lgrp_plat_node_memory[basenode].start &&
    640 			    ea <= (lgrp_plat_node_memory[node].end + 1)) {
    641 				/*
    642 				 * large page found to cross mnode boundary.
    643 				 * Return Failure if workaround not enabled.
    644 				 */
    645 				if (mnode_xwa == 0)
    646 					return (1);
    647 				mnode_xwa++;
    648 			}
    649 		}
    650 		prevnode = node;
    651 	}
    652 	return (0);
    653 }
    654 
    655 
    656 lgrp_handle_t
    657 plat_mem_node_to_lgrphand(int mnode)
    658 {
    659 	if (max_mem_nodes == 1)
    660 		return (LGRP_DEFAULT_HANDLE);
    661 
    662 	return ((lgrp_handle_t)mnode);
    663 }
    664 
    665 
    666 int
    667 plat_pfn_to_mem_node(pfn_t pfn)
    668 {
    669 	int	node;
    670 
    671 	if (max_mem_nodes == 1)
    672 		return (0);
    673 
    674 	for (node = 0; node < lgrp_plat_node_cnt; node++) {
    675 		/*
    676 		 * Skip nodes with no memory
    677 		 */
    678 		if (!lgrp_plat_node_memory[node].exists)
    679 			continue;
    680 
    681 		if (pfn >= lgrp_plat_node_memory[node].start &&
    682 		    pfn <= lgrp_plat_node_memory[node].end)
    683 			return (node);
    684 	}
    685 
    686 	/*
    687 	 * Didn't find memnode where this PFN lives which should never happen
    688 	 */
    689 	ASSERT(node < lgrp_plat_node_cnt);
    690 	return (-1);
    691 }
    692 
    693 
    694 /*
    695  * LGROUP PLATFORM INTERFACE ROUTINES
    696  */
    697 
    698 /*
    699  * Allocate additional space for an lgroup.
    700  */
    701 /* ARGSUSED */
    702 lgrp_t *
    703 lgrp_plat_alloc(lgrp_id_t lgrpid)
    704 {
    705 	lgrp_t *lgrp;
    706 
    707 	lgrp = &lgrp_space[nlgrps_alloc++];
    708 	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
    709 		return (NULL);
    710 	return (lgrp);
    711 }
    712 
    713 
    714 /*
    715  * Platform handling for (re)configuration changes
    716  */
    717 /* ARGSUSED */
    718 void
    719 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
    720 {
    721 }
    722 
    723 
    724 /*
    725  * Return the platform handle for the lgroup containing the given CPU
    726  */
    727 /* ARGSUSED */
    728 lgrp_handle_t
    729 lgrp_plat_cpu_to_hand(processorid_t id)
    730 {
    731 	lgrp_handle_t	hand;
    732 
    733 	if (lgrp_plat_node_cnt == 1)
    734 		return (LGRP_DEFAULT_HANDLE);
    735 
    736 	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
    737 	    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
    738 
    739 	ASSERT(hand != (lgrp_handle_t)-1);
    740 	if (hand == (lgrp_handle_t)-1)
    741 		return (LGRP_NULL_HANDLE);
    742 
    743 	return (hand);
    744 }
    745 
    746 
    747 /*
    748  * Platform-specific initialization of lgroups
    749  */
    750 void
    751 lgrp_plat_init(lgrp_init_stages_t stage)
    752 {
    753 #if defined(__xpv)
    754 #else	/* __xpv */
    755 	u_longlong_t	value;
    756 #endif	/* __xpv */
    757 
    758 	switch (stage) {
    759 	case LGRP_INIT_STAGE1:
    760 #if defined(__xpv)
    761 		/*
    762 		 * XXPV	For now, the hypervisor treats all memory equally.
    763 		 */
    764 		lgrp_plat_node_cnt = max_mem_nodes = 1;
    765 #else	/* __xpv */
    766 		/*
    767 		 * Get boot property for lgroup topology height limit
    768 		 */
    769 		if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
    770 			(void) lgrp_topo_ht_limit_set((int)value);
    771 
    772 		/*
    773 		 * Get boot property for enabling/disabling SRAT
    774 		 */
    775 		if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
    776 			lgrp_plat_srat_enable = (int)value;
    777 
    778 		/*
    779 		 * Get boot property for enabling/disabling SLIT
    780 		 */
    781 		if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
    782 			lgrp_plat_slit_enable = (int)value;
    783 
    784 		/*
    785 		 * Initialize as a UMA machine
    786 		 */
    787 		if (lgrp_topo_ht_limit() == 1) {
    788 			lgrp_plat_node_cnt = max_mem_nodes = 1;
    789 			return;
    790 		}
    791 
    792 		lgrp_plat_get_numa_config();
    793 #endif	/* __xpv */
    794 		break;
    795 
    796 	case LGRP_INIT_STAGE3:
    797 		lgrp_plat_probe();
    798 		lgrp_plat_release_bootstrap();
    799 		break;
    800 
    801 	case LGRP_INIT_STAGE4:
    802 		lgrp_plat_main_init();
    803 		break;
    804 
    805 	default:
    806 		break;
    807 	}
    808 }
    809 
    810 
    811 /*
    812  * Return latency between "from" and "to" lgroups
    813  *
    814  * This latency number can only be used for relative comparison
    815  * between lgroups on the running system, cannot be used across platforms,
    816  * and may not reflect the actual latency.  It is platform and implementation
    817  * specific, so platform gets to decide its value.  It would be nice if the
    818  * number was at least proportional to make comparisons more meaningful though.
    819  */
    820 /* ARGSUSED */
    821 int
    822 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
    823 {
    824 	lgrp_handle_t	src, dest;
    825 	int		node;
    826 
    827 	if (max_mem_nodes == 1)
    828 		return (0);
    829 
    830 	/*
    831 	 * Return max latency for root lgroup
    832 	 */
    833 	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
    834 		return (lgrp_plat_lat_stats.latency_max);
    835 
    836 	src = from;
    837 	dest = to;
    838 
    839 	/*
    840 	 * Return 0 for nodes (lgroup platform handles) out of range
    841 	 */
    842 	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
    843 		return (0);
    844 
    845 	/*
    846 	 * Probe from current CPU if its lgroup latencies haven't been set yet
    847 	 * and we are trying to get latency from current CPU to some node
    848 	 */
    849 	node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
    850 	    lgrp_plat_cpu_node_nentries);
    851 	ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
    852 	if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src)
    853 		lgrp_plat_probe();
    854 
    855 	return (lgrp_plat_lat_stats.latencies[src][dest]);
    856 }
    857 
    858 
    859 /*
    860  * Return the maximum number of lgrps supported by the platform.
    861  * Before lgrp topology is known it returns an estimate based on the number of
    862  * nodes. Once topology is known it returns the actual maximim number of lgrps
    863  * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and
    864  * dynamic addition of new nodes, this number may not grow during system
    865  * lifetime (yet).
    866  */
    867 int
    868 lgrp_plat_max_lgrps(void)
    869 {
    870 	return (lgrp_topo_initialized ?
    871 	    lgrp_alloc_max + 1 :
    872 	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
    873 }
    874 
    875 
    876 /*
    877  * Return the number of free pages in an lgroup.
    878  *
    879  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
    880  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
    881  * number of allocatable base pagesize pages corresponding to the
    882  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
    883  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
    884  * memory installed, regardless of whether or not it's usable.
    885  */
    886 pgcnt_t
    887 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
    888 {
    889 	int	mnode;
    890 	pgcnt_t npgs = (pgcnt_t)0;
    891 	extern struct memlist *phys_avail;
    892 	extern struct memlist *phys_install;
    893 
    894 
    895 	if (plathand == LGRP_DEFAULT_HANDLE)
    896 		return (lgrp_plat_mem_size_default(plathand, query));
    897 
    898 	if (plathand != LGRP_NULL_HANDLE) {
    899 		mnode = plat_lgrphand_to_mem_node(plathand);
    900 		if (mnode >= 0 && mem_node_config[mnode].exists) {
    901 			switch (query) {
    902 			case LGRP_MEM_SIZE_FREE:
    903 				npgs = MNODE_PGCNT(mnode);
    904 				break;
    905 			case LGRP_MEM_SIZE_AVAIL:
    906 				npgs = mem_node_memlist_pages(mnode,
    907 				    phys_avail);
    908 				break;
    909 			case LGRP_MEM_SIZE_INSTALL:
    910 				npgs = mem_node_memlist_pages(mnode,
    911 				    phys_install);
    912 				break;
    913 			default:
    914 				break;
    915 			}
    916 		}
    917 	}
    918 	return (npgs);
    919 }
    920 
    921 
    922 /*
    923  * Return the platform handle of the lgroup that contains the physical memory
    924  * corresponding to the given page frame number
    925  */
    926 /* ARGSUSED */
    927 lgrp_handle_t
    928 lgrp_plat_pfn_to_hand(pfn_t pfn)
    929 {
    930 	int	mnode;
    931 
    932 	if (max_mem_nodes == 1)
    933 		return (LGRP_DEFAULT_HANDLE);
    934 
    935 	if (pfn > physmax)
    936 		return (LGRP_NULL_HANDLE);
    937 
    938 	mnode = plat_pfn_to_mem_node(pfn);
    939 	if (mnode < 0)
    940 		return (LGRP_NULL_HANDLE);
    941 
    942 	return (MEM_NODE_2_LGRPHAND(mnode));
    943 }
    944 
    945 
    946 /*
    947  * Probe memory in each node from current CPU to determine latency topology
    948  *
    949  * The probing code will probe the vendor ID register on the Northbridge of
    950  * Opteron processors and probe memory for other processors by default.
    951  *
    952  * Since probing is inherently error prone, the code takes laps across all the
    953  * nodes probing from each node to each of the other nodes some number of
    954  * times.  Furthermore, each node is probed some number of times before moving
    955  * onto the next one during each lap.  The minimum latency gotten between nodes
    956  * is kept as the latency between the nodes.
    957  *
    958  * After all that,  the probe times are adjusted by normalizing values that are
    959  * close to each other and local latencies are made the same.  Lastly, the
    960  * latencies are verified to make sure that certain conditions are met (eg.
    961  * local < remote, latency(a, b) == latency(b, a), etc.).
    962  *
    963  * If any of the conditions aren't met, the code will export a NUMA
    964  * configuration with the local CPUs and memory given by the SRAT or PCI config
    965  * space registers and one remote memory latency since it can't tell exactly
    966  * how far each node is from each other.
    967  */
    968 void
    969 lgrp_plat_probe(void)
    970 {
    971 	int				from;
    972 	int				i;
    973 	lgrp_plat_latency_stats_t	*lat_stats;
    974 	boolean_t			probed;
    975 	hrtime_t			probe_time;
    976 	int				to;
    977 
    978 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
    979 	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
    980 		return;
    981 
    982 	/*
    983 	 * Determine ID of node containing current CPU
    984 	 */
    985 	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
    986 	    lgrp_plat_cpu_node_nentries);
    987 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
    988 	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
    989 		ASSERT(lgrp_plat_node_domain[from].exists);
    990 
    991 	/*
    992 	 * Don't need to probe if got times already
    993 	 */
    994 	lat_stats = &lgrp_plat_lat_stats;
    995 	if (lat_stats->latencies[from][from] != 0)
    996 		return;
    997 
    998 	/*
    999 	 * Read vendor ID in Northbridge or read and write page(s)
   1000 	 * in each node from current CPU and remember how long it takes,
   1001 	 * so we can build latency topology of machine later.
   1002 	 * This should approximate the memory latency between each node.
   1003 	 */
   1004 	probed = B_FALSE;
   1005 	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
   1006 		for (to = 0; to < lgrp_plat_node_cnt; to++) {
   1007 			/*
   1008 			 * Get probe time and skip over any nodes that can't be
   1009 			 * probed yet or don't have memory
   1010 			 */
   1011 			probe_time = lgrp_plat_probe_time(to,
   1012 			    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
   1013 			    &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
   1014 			    &lgrp_plat_probe_stats);
   1015 			if (probe_time == 0)
   1016 				continue;
   1017 
   1018 			probed = B_TRUE;
   1019 
   1020 			/*
   1021 			 * Keep lowest probe time as latency between nodes
   1022 			 */
   1023 			if (lat_stats->latencies[from][to] == 0 ||
   1024 			    probe_time < lat_stats->latencies[from][to])
   1025 				lat_stats->latencies[from][to] = probe_time;
   1026 
   1027 			/*
   1028 			 * Update overall minimum and maximum probe times
   1029 			 * across all nodes
   1030 			 */
   1031 			if (probe_time < lat_stats->latency_min ||
   1032 			    lat_stats->latency_min == -1)
   1033 				lat_stats->latency_min = probe_time;
   1034 			if (probe_time > lat_stats->latency_max)
   1035 				lat_stats->latency_max = probe_time;
   1036 		}
   1037 	}
   1038 
   1039 	/*
   1040 	 * Bail out if weren't able to probe any nodes from current CPU
   1041 	 */
   1042 	if (probed == B_FALSE)
   1043 		return;
   1044 
   1045 	/*
   1046 	 * - Fix up latencies such that local latencies are same,
   1047 	 *   latency(i, j) == latency(j, i), etc. (if possible)
   1048 	 *
   1049 	 * - Verify that latencies look ok
   1050 	 *
   1051 	 * - Fallback to just optimizing for local and remote if
   1052 	 *   latencies didn't look right
   1053 	 */
   1054 	lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats,
   1055 	    &lgrp_plat_probe_stats);
   1056 	lgrp_plat_probe_stats.probe_error_code =
   1057 	    lgrp_plat_latency_verify(lgrp_plat_node_memory,
   1058 	    &lgrp_plat_lat_stats);
   1059 	if (lgrp_plat_probe_stats.probe_error_code)
   1060 		lgrp_plat_2level_setup(lgrp_plat_node_memory,
   1061 		    &lgrp_plat_lat_stats);
   1062 }
   1063 
   1064 
/*
 * Return platform handle for root lgroup
 *
 * The root lgroup is always identified by LGRP_DEFAULT_HANDLE; the other
 * platform routines in this file test for that handle to recognize the root.
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}
   1073 
   1074 
   1075 /*
   1076  * INTERNAL ROUTINES
   1077  */
   1078 
   1079 
   1080 /*
   1081  * Update CPU to node mapping for given CPU and proximity domain (and returns
   1082  * negative numbers for errors and positive ones for success)
   1083  */
   1084 static int
   1085 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
   1086     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
   1087 {
   1088 	uint_t	i;
   1089 	int	node;
   1090 
   1091 	/*
   1092 	 * Get node number for proximity domain
   1093 	 */
   1094 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
   1095 	if (node == -1) {
   1096 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
   1097 		    domain);
   1098 		if (node == -1)
   1099 			return (-1);
   1100 	}
   1101 
   1102 	/*
   1103 	 * Search for entry with given APIC ID and fill in its node and
   1104 	 * proximity domain IDs (if they haven't been set already)
   1105 	 */
   1106 	for (i = 0; i < nentries; i++) {
   1107 		/*
   1108 		 * Skip nonexistent entries and ones without matching APIC ID
   1109 		 */
   1110 		if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
   1111 			continue;
   1112 
   1113 		/*
   1114 		 * Just return if entry completely and correctly filled in
   1115 		 * already
   1116 		 */
   1117 		if (cpu_node[i].prox_domain == domain &&
   1118 		    cpu_node[i].node == node)
   1119 			return (1);
   1120 
   1121 		/*
   1122 		 * Fill in node and proximity domain IDs
   1123 		 */
   1124 		cpu_node[i].prox_domain = domain;
   1125 		cpu_node[i].node = node;
   1126 
   1127 		return (0);
   1128 	}
   1129 
   1130 	/*
   1131 	 * Return error when entry for APIC ID wasn't found in table
   1132 	 */
   1133 	return (-2);
   1134 }
   1135 
   1136 
   1137 /*
   1138  * Get node ID for given CPU
   1139  */
   1140 static int
   1141 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
   1142     int cpu_node_nentries)
   1143 {
   1144 	processorid_t	cpuid;
   1145 
   1146 	if (cp == NULL)
   1147 		return (-1);
   1148 
   1149 	cpuid = cp->cpu_id;
   1150 	if (cpuid < 0 || cpuid >= max_ncpus)
   1151 		return (-1);
   1152 
   1153 	/*
   1154 	 * SRAT doesn't exist, isn't enabled, or there was an error processing
   1155 	 * it, so return node ID for Opteron and -1 otherwise.
   1156 	 */
   1157 	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
   1158 	    lgrp_plat_srat_error) {
   1159 		if (is_opteron())
   1160 			return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE));
   1161 		return (-1);
   1162 	}
   1163 
   1164 	/*
   1165 	 * Return -1 when CPU to node ID mapping entry doesn't exist for given
   1166 	 * CPU
   1167 	 */
   1168 	if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
   1169 		return (-1);
   1170 
   1171 	return (cpu_node[cpuid].node);
   1172 }
   1173 
   1174 
   1175 /*
   1176  * Return node number for given proximity domain/system locality
   1177  */
   1178 static int
   1179 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
   1180     uint32_t domain)
   1181 {
   1182 	uint_t	node;
   1183 	uint_t	start;
   1184 
   1185 	/*
   1186 	 * Hash proximity domain ID into node to domain mapping table (array),
   1187 	 * search for entry with matching proximity domain ID, and return index
   1188 	 * of matching entry as node ID.
   1189 	 */
   1190 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
   1191 	do {
   1192 		if (node_domain[node].prox_domain == domain &&
   1193 		    node_domain[node].exists)
   1194 			return (node);
   1195 		node = (node + 1) % node_cnt;
   1196 	} while (node != start);
   1197 	return (-1);
   1198 }
   1199 
   1200 
   1201 /*
   1202  * Get NUMA configuration of machine
   1203  */
   1204 static void
   1205 lgrp_plat_get_numa_config(void)
   1206 {
   1207 	uint_t		probe_op;
   1208 
   1209 	/*
   1210 	 * Read boot property with CPU to APIC ID mapping table/array to
   1211 	 * determine number of CPUs
   1212 	 */
   1213 	lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
   1214 
   1215 	/*
   1216 	 * Determine which CPUs and memory are local to each other and number
   1217 	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
   1218 	 */
   1219 	if (lgrp_plat_apic_ncpus > 0) {
   1220 		int	retval;
   1221 
   1222 		/*
   1223 		 * Temporarily allocate boot memory to use for CPU to node
   1224 		 * mapping since kernel memory allocator isn't alive yet
   1225 		 */
   1226 		lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
   1227 		    NULL, lgrp_plat_apic_ncpus * sizeof (cpu_node_map_t),
   1228 		    sizeof (int));
   1229 
   1230 		ASSERT(lgrp_plat_cpu_node != NULL);
   1231 		if (lgrp_plat_cpu_node) {
   1232 			lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
   1233 			bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
   1234 			    sizeof (cpu_node_map_t));
   1235 		}
   1236 
   1237 		/*
   1238 		 * Fill in CPU to node ID mapping table with APIC ID for each
   1239 		 * CPU
   1240 		 */
   1241 		(void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
   1242 
   1243 		retval = lgrp_plat_process_srat(srat_ptr,
   1244 		    &lgrp_plat_prox_domain_min,
   1245 		    lgrp_plat_node_domain, lgrp_plat_cpu_node,
   1246 		    lgrp_plat_apic_ncpus, lgrp_plat_node_memory);
   1247 		if (retval <= 0) {
   1248 			lgrp_plat_srat_error = retval;
   1249 			lgrp_plat_node_cnt = 1;
   1250 		} else {
   1251 			lgrp_plat_srat_error = 0;
   1252 			lgrp_plat_node_cnt = retval;
   1253 		}
   1254 	}
   1255 
   1256 	/*
   1257 	 * Try to use PCI config space registers on Opteron if there's an error
   1258 	 * processing CPU to APIC ID mapping or SRAT
   1259 	 */
   1260 	if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
   1261 	    is_opteron())
   1262 		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
   1263 		    lgrp_plat_node_memory);
   1264 
   1265 	/*
   1266 	 * Don't bother to setup system for multiple lgroups and only use one
   1267 	 * memory node when memory is interleaved between any nodes or there is
   1268 	 * only one NUMA node
   1269 	 *
   1270 	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
   1271 	 *	 when and if it happens for x86/x64
   1272 	 */
   1273 	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
   1274 		lgrp_plat_node_cnt = max_mem_nodes = 1;
   1275 		(void) lgrp_topo_ht_limit_set(1);
   1276 		return;
   1277 	}
   1278 
   1279 	/*
   1280 	 * Leaf lgroups on x86/x64 architectures contain one physical
   1281 	 * processor chip. Tune lgrp_expand_proc_thresh and
   1282 	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
   1283 	 * things out aggressively.
   1284 	 */
   1285 	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
   1286 	lgrp_expand_proc_diff = 0;
   1287 
   1288 	/*
   1289 	 * There should be one memnode (physical page free list(s)) for
   1290 	 * each node
   1291 	 */
   1292 	max_mem_nodes = lgrp_plat_node_cnt;
   1293 
   1294 	/*
   1295 	 * Initialize min and max latency before reading SLIT or probing
   1296 	 */
   1297 	lgrp_plat_lat_stats.latency_min = -1;
   1298 	lgrp_plat_lat_stats.latency_max = 0;
   1299 
   1300 	/*
   1301 	 * Determine how far each NUMA node is from each other by
   1302 	 * reading ACPI System Locality Information Table (SLIT) if it
   1303 	 * exists
   1304 	 */
   1305 	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
   1306 	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
   1307 	    &lgrp_plat_lat_stats);
   1308 	if (lgrp_plat_slit_error == 0)
   1309 		return;
   1310 
   1311 	/*
   1312 	 * Probe to determine latency between NUMA nodes when SLIT
   1313 	 * doesn't exist or make sense
   1314 	 */
   1315 	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
   1316 
   1317 	/*
   1318 	 * Specify whether to probe using vendor ID register or page copy
   1319 	 * if hasn't been specified already or is overspecified
   1320 	 */
   1321 	probe_op = lgrp_plat_probe_flags &
   1322 	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
   1323 
   1324 	if (probe_op == 0 ||
   1325 	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
   1326 		lgrp_plat_probe_flags &=
   1327 		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
   1328 		if (is_opteron())
   1329 			lgrp_plat_probe_flags |=
   1330 			    LGRP_PLAT_PROBE_VENDOR;
   1331 		else
   1332 			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
   1333 	}
   1334 
   1335 	/*
   1336 	 * Probing errors can mess up the lgroup topology and
   1337 	 * force us fall back to a 2 level lgroup topology.
   1338 	 * Here we bound how tall the lgroup topology can grow
   1339 	 * in hopes of avoiding any anamolies in probing from
   1340 	 * messing up the lgroup topology by limiting the
   1341 	 * accuracy of the latency topology.
   1342 	 *
   1343 	 * Assume that nodes will at least be configured in a
   1344 	 * ring, so limit height of lgroup topology to be less
   1345 	 * than number of nodes on a system with 4 or more
   1346 	 * nodes
   1347 	 */
   1348 	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
   1349 	    lgrp_topo_ht_limit_default())
   1350 		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
   1351 }
   1352 
   1353 
/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

/*
 * Shift actually applied by lgrp_plat_latency_adjust() when it decides
 * whether two latencies are within tolerance of each other; it starts out as
 * LGRP_LAT_TOLERANCE_SHIFT.
 * NOTE(review): kept as a writable global, presumably so it can be tuned
 * without rebuilding -- confirm.
 */
int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
   1361 
   1362 
/*
 * Adjust latencies between nodes to be symmetric, normalize latencies between
 * any nodes that are within some tolerance to be same, and make local
 * latencies be same
 *
 * The work is done in four passes over lat_stats->latencies[][]:
 *
 * 1. Make latency(i, j) == latency(j, i) by taking the smaller of the two,
 *    recording the size of the correction in probe_stats->probe_errors and
 *    counting corrections beyond the tolerance in probe_stats->probe_suspect.
 * 2. Make latencies of different node pairs that are within tolerance of
 *    each other identical.
 * 3. Set every nonzero local latency to the minimum local latency.
 * 4. Recompute lat_stats->latency_max from the adjusted table
 *    (latency_min is not touched here).
 *
 * Each change is also reported through lgrp_config().  A latency of 0 means
 * "not set yet" throughout and such entries are skipped.
 */
static void
lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
    lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is an UMA machine or don't have args needed
	 */
	if (max_mem_nodes == 1)
		return;

	ASSERT(node_memory != NULL && lat_stats != NULL &&
	    probe_stats != NULL);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		if (!node_memory[i].exists)
			continue;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (!node_memory[j].exists)
				continue;

			t1 = lat_stats->latencies[i][j];
			t2 = lat_stats->latencies[j][i];

			/*
			 * Nothing to fix when either direction is unset or
			 * both directions agree already
			 */
			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 *
			 * t1 != t2 here, so exactly one of the two branches
			 * below runs and t is always assigned.
			 */
			if (t1 > t2) {
				t = t2;
				probe_stats->probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					probe_stats->probe_suspect[i][j]++;
					probe_stats->probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				probe_stats->probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					probe_stats->probe_suspect[i][j]++;
					probe_stats->probe_suspect[j][i]++;
				}
			}

			/*
			 * Store the minimum in both directions and report
			 * both old values as changed to it
			 */
			lat_stats->latencies[i][j] =
			    lat_stats->latencies[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}
	}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes which
	 * are about the same distance apart and make the latencies be the same
	 * if they are close enough together
	 *
	 * Note that only the second node of each pair (j and l) is checked
	 * for having memory in this pass.
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (!node_memory[j].exists)
				continue;
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lat_stats->latencies[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					if (!node_memory[l].exists)
						continue;
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lat_stats->latencies[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */

					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to get
					 * more consistency (if possible), and
					 * remember which latencies were
					 * adjusted for next time
					 *
					 * t1 is updated along with the table
					 * so that the remaining (k, l) pairs
					 * are compared against the adjusted
					 * latency of (i, j).
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lat_stats->latencies[i][j] =
					    lat_stats->latencies[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
			}
		}
	}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 *
	 * min is unsigned, so -1 is its largest value and serves as the
	 * "no local latency seen yet" marker.
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		if (!node_memory[i].exists)
			continue;
		t = lat_stats->latencies[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			int	local;

			if (!node_memory[i].exists)
				continue;

			/*
			 * NOTE(review): local is an int while the other
			 * latencies here are u_longlong_t -- confirm that
			 * latencies always fit.
			 */
			local = lat_stats->latencies[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				probe_stats->probe_suspect[i][i]++;

			probe_stats->probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
			lat_stats->latencies[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since just adjusted latencies
	 */
	lat_stats->latency_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (!node_memory[j].exists)
				continue;
			t = lat_stats->latencies[i][j];
			if (t > lat_stats->latency_max)
				lat_stats->latency_max = t;
		}
	}
}
   1601 
   1602 
   1603 /*
   1604  * Verify following about latencies between nodes:
   1605  *
   1606  * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
   1607  * - Local latencies same
   1608  * - Local < remote
   1609  * - Number of latencies seen is reasonable
   1610  * - Number of occurrences of a given latency should be more than 1
   1611  *
   1612  * Returns:
   1613  *	0	Success
   1614  *	-1	Not symmetric
   1615  *	-2	Local latencies not same
   1616  *	-3	Local >= remote
   1617  */
   1618 static int
   1619 lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
   1620     lgrp_plat_latency_stats_t *lat_stats)
   1621 {
   1622 	int				i;
   1623 	int				j;
   1624 	u_longlong_t			t1;
   1625 	u_longlong_t			t2;
   1626 
   1627 	ASSERT(node_memory != NULL && lat_stats != NULL);
   1628 
   1629 	/*
   1630 	 * Nothing to do when this is an UMA machine, lgroup topology is
   1631 	 * limited to 2 levels, or there aren't any probe times yet
   1632 	 */
   1633 	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
   1634 	    lat_stats->latencies[0][0] == 0)
   1635 		return (0);
   1636 
   1637 	/*
   1638 	 * Make sure that latencies are symmetric between any two nodes
   1639 	 * (ie. latency(node0, node1) == latency(node1, node0))
   1640 	 */
   1641 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1642 		if (!node_memory[i].exists)
   1643 			continue;
   1644 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
   1645 			if (!node_memory[j].exists)
   1646 				continue;
   1647 			t1 = lat_stats->latencies[i][j];
   1648 			t2 = lat_stats->latencies[j][i];
   1649 
   1650 			if (t1 == 0 || t2 == 0 || t1 == t2)
   1651 				continue;
   1652 
   1653 			return (-1);
   1654 		}
   1655 	}
   1656 
   1657 	/*
   1658 	 * Local latencies should be same
   1659 	 */
   1660 	t1 = lat_stats->latencies[0][0];
   1661 	for (i = 1; i < lgrp_plat_node_cnt; i++) {
   1662 		if (!node_memory[i].exists)
   1663 			continue;
   1664 
   1665 		t2 = lat_stats->latencies[i][i];
   1666 		if (t2 == 0)
   1667 			continue;
   1668 
   1669 		if (t1 == 0) {
   1670 			t1 = t2;
   1671 			continue;
   1672 		}
   1673 
   1674 		if (t1 != t2)
   1675 			return (-2);
   1676 	}
   1677 
   1678 	/*
   1679 	 * Local latencies should be less than remote
   1680 	 */
   1681 	if (t1) {
   1682 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1683 			for (j = 0; j < lgrp_plat_node_cnt; j++) {
   1684 				if (!node_memory[j].exists)
   1685 					continue;
   1686 				t2 = lat_stats->latencies[i][j];
   1687 				if (i == j || t2 == 0)
   1688 					continue;
   1689 
   1690 				if (t1 >= t2)
   1691 					return (-3);
   1692 			}
   1693 		}
   1694 	}
   1695 
   1696 	return (0);
   1697 }
   1698 
   1699 
   1700 /*
   1701  * Platform-specific initialization
   1702  */
   1703 static void
   1704 lgrp_plat_main_init(void)
   1705 {
   1706 	int	curnode;
   1707 	int	ht_limit;
   1708 	int	i;
   1709 
   1710 	/*
   1711 	 * Print a notice that MPO is disabled when memory is interleaved
   1712 	 * across nodes....Would do this when it is discovered, but can't
   1713 	 * because it happens way too early during boot....
   1714 	 */
   1715 	if (lgrp_plat_mem_intrlv)
   1716 		cmn_err(CE_NOTE,
   1717 		    "MPO disabled because memory is interleaved\n");
   1718 
   1719 	/*
   1720 	 * Don't bother to do any probing if it is disabled, there is only one
   1721 	 * node, or the height of the lgroup topology less than or equal to 2
   1722 	 */
   1723 	ht_limit = lgrp_topo_ht_limit();
   1724 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
   1725 	    max_mem_nodes == 1 || ht_limit <= 2) {
   1726 		/*
   1727 		 * Setup lgroup latencies for 2 level lgroup topology
   1728 		 * (ie. local and remote only) if they haven't been set yet
   1729 		 */
   1730 		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
   1731 		    lgrp_plat_lat_stats.latency_max == 0)
   1732 			lgrp_plat_2level_setup(lgrp_plat_node_memory,
   1733 			    &lgrp_plat_lat_stats);
   1734 		return;
   1735 	}
   1736 
   1737 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
   1738 		/*
   1739 		 * Should have been able to probe from CPU 0 when it was added
   1740 		 * to lgroup hierarchy, but may not have been able to then
   1741 		 * because it happens so early in boot that gethrtime() hasn't
   1742 		 * been initialized.  (:-(
   1743 		 */
   1744 		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
   1745 		    lgrp_plat_cpu_node_nentries);
   1746 		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
   1747 		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
   1748 			lgrp_plat_probe();
   1749 
   1750 		return;
   1751 	}
   1752 
   1753 	/*
   1754 	 * When probing memory, use one page for every sample to determine
   1755 	 * lgroup topology and taking multiple samples
   1756 	 */
   1757 	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
   1758 		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
   1759 		    lgrp_plat_probe_nsamples;
   1760 
   1761 	/*
   1762 	 * Map memory in each node needed for probing to determine latency
   1763 	 * topology
   1764 	 */
   1765 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1766 		int	mnode;
   1767 
   1768 		/*
   1769 		 * Skip this node and leave its probe page NULL
   1770 		 * if it doesn't have any memory
   1771 		 */
   1772 		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
   1773 		if (!mem_node_config[mnode].exists) {
   1774 			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
   1775 			continue;
   1776 		}
   1777 
   1778 		/*
   1779 		 * Allocate one kernel virtual page
   1780 		 */
   1781 		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
   1782 		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
   1783 		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
   1784 			cmn_err(CE_WARN,
   1785 			    "lgrp_plat_main_init: couldn't allocate memory");
   1786 			return;
   1787 		}
   1788 
   1789 		/*
   1790 		 * Get PFN for first page in each node
   1791 		 */
   1792 		lgrp_plat_probe_mem_config.probe_pfn[i] =
   1793 		    mem_node_config[mnode].physbase;
   1794 
   1795 		/*
   1796 		 * Map virtual page to first page in node
   1797 		 */
   1798 		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
   1799 		    lgrp_plat_probe_mem_config.probe_memsize,
   1800 		    lgrp_plat_probe_mem_config.probe_pfn[i],
   1801 		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
   1802 		    HAT_LOAD_NOCONSIST);
   1803 	}
   1804 
   1805 	/*
   1806 	 * Probe from current CPU
   1807 	 */
   1808 	lgrp_plat_probe();
   1809 }
   1810 
   1811 
   1812 /*
   1813  * Return the number of free, allocatable, or installed
   1814  * pages in an lgroup
   1815  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
   1816  * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
   1817  */
   1818 /* ARGSUSED */
   1819 static pgcnt_t
   1820 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
   1821 {
   1822 	struct memlist *mlist;
   1823 	pgcnt_t npgs = 0;
   1824 	extern struct memlist *phys_avail;
   1825 	extern struct memlist *phys_install;
   1826 
   1827 	switch (query) {
   1828 	case LGRP_MEM_SIZE_FREE:
   1829 		return ((pgcnt_t)freemem);
   1830 	case LGRP_MEM_SIZE_AVAIL:
   1831 		memlist_read_lock();
   1832 		for (mlist = phys_avail; mlist; mlist = mlist->next)
   1833 			npgs += btop(mlist->size);
   1834 		memlist_read_unlock();
   1835 		return (npgs);
   1836 	case LGRP_MEM_SIZE_INSTALL:
   1837 		memlist_read_lock();
   1838 		for (mlist = phys_install; mlist; mlist = mlist->next)
   1839 			npgs += btop(mlist->size);
   1840 		memlist_read_unlock();
   1841 		return (npgs);
   1842 	default:
   1843 		return ((pgcnt_t)0);
   1844 	}
   1845 }
   1846 
   1847 
   1848 /*
   1849  * Update node to proximity domain mappings for given domain and return node ID
   1850  */
   1851 static int
   1852 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
   1853     uint32_t domain)
   1854 {
   1855 	uint_t	node;
   1856 	uint_t	start;
   1857 
   1858 	/*
   1859 	 * Hash proximity domain ID into node to domain mapping table (array)
   1860 	 * and add entry for it into first non-existent or matching entry found
   1861 	 */
   1862 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
   1863 	do {
   1864 		/*
   1865 		 * Entry doesn't exist yet, so create one for this proximity
   1866 		 * domain and return node ID which is index into mapping table.
   1867 		 */
   1868 		if (!node_domain[node].exists) {
   1869 			node_domain[node].exists = 1;
   1870 			node_domain[node].prox_domain = domain;
   1871 			return (node);
   1872 		}
   1873 
   1874 		/*
   1875 		 * Entry exists for this proximity domain already, so just
   1876 		 * return node ID (index into table).
   1877 		 */
   1878 		if (node_domain[node].prox_domain == domain)
   1879 			return (node);
   1880 		node = NODE_DOMAIN_HASH(node + 1, node_cnt);
   1881 	} while (node != start);
   1882 
   1883 	/*
   1884 	 * Ran out of supported number of entries which shouldn't happen....
   1885 	 */
   1886 	ASSERT(node != start);
   1887 	return (-1);
   1888 }
   1889 
   1890 
   1891 /*
   1892  * Update node memory information for given proximity domain with specified
   1893  * starting and ending physical address range (and return positive numbers for
   1894  * success and negative ones for errors)
   1895  */
   1896 static int
   1897 lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt,
   1898     node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
   1899     uint32_t domain)
   1900 {
   1901 	int	node;
   1902 
   1903 	/*
   1904 	 * Get node number for proximity domain
   1905 	 */
   1906 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
   1907 	if (node == -1) {
   1908 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
   1909 		    domain);
   1910 		if (node == -1)
   1911 			return (-1);
   1912 	}
   1913 
   1914 	/*
   1915 	 * Create entry in table for node if it doesn't exist
   1916 	 */
   1917 	if (!node_memory[node].exists) {
   1918 		node_memory[node].exists = 1;
   1919 		node_memory[node].start = btop(start);
   1920 		node_memory[node].end = btop(end);
   1921 		node_memory[node].prox_domain = domain;
   1922 		return (0);
   1923 	}
   1924 
   1925 	/*
   1926 	 * Entry already exists for this proximity domain
   1927 	 *
   1928 	 * There may be more than one SRAT memory entry for a domain, so we may
   1929 	 * need to update existing start or end address for the node.
   1930 	 */
   1931 	if (node_memory[node].prox_domain == domain) {
   1932 		if (btop(start) < node_memory[node].start)
   1933 			node_memory[node].start = btop(start);
   1934 		if (btop(end) > node_memory[node].end)
   1935 			node_memory[node].end = btop(end);
   1936 		return (1);
   1937 	}
   1938 	return (-2);
   1939 }
   1940 
   1941 
   1942 /*
   1943  * Have to sort node by starting physical address because VM system (physical
   1944  * page free list management) assumes and expects memnodes to be sorted in
   1945  * ascending order by physical address.  If not, the kernel will panic in
   1946  * potentially a number of different places.  (:-(
   1947  * NOTE: This workaround will not be sufficient if/when hotplugging memory is
   1948  *	 supported on x86/x64.
   1949  */
   1950 static void
   1951 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt,
   1952     cpu_node_map_t *cpu_node, int cpu_count, node_phys_addr_map_t *node_memory)
   1953 {
   1954 	boolean_t	found;
   1955 	int		i;
   1956 	int		j;
   1957 	int		n;
   1958 	boolean_t	sorted;
   1959 	boolean_t	swapped;
   1960 
   1961 	if (!lgrp_plat_node_sort_enable || node_cnt <= 1 ||
   1962 	    node_domain == NULL || node_memory == NULL)
   1963 		return;
   1964 
   1965 	/*
   1966 	 * Sorted already?
   1967 	 */
   1968 	sorted = B_TRUE;
   1969 	for (i = 0; i < node_cnt - 1; i++) {
   1970 		/*
   1971 		 * Skip entries that don't exist
   1972 		 */
   1973 		if (!node_memory[i].exists)
   1974 			continue;
   1975 
   1976 		/*
   1977 		 * Try to find next existing entry to compare against
   1978 		 */
   1979 		found = B_FALSE;
   1980 		for (j = i + 1; j < node_cnt; j++) {
   1981 			if (node_memory[j].exists) {
   1982 				found = B_TRUE;
   1983 				break;
   1984 			}
   1985 		}
   1986 
   1987 		/*
   1988 		 * Done if no more existing entries to compare against
   1989 		 */
   1990 		if (found == B_FALSE)
   1991 			break;
   1992 
   1993 		/*
   1994 		 * Not sorted if starting address of current entry is bigger
   1995 		 * than starting address of next existing entry
   1996 		 */
   1997 		if (node_memory[i].start > node_memory[j].start) {
   1998 			sorted = B_FALSE;
   1999 			break;
   2000 		}
   2001 	}
   2002 
   2003 	/*
   2004 	 * Don't need to sort if sorted already
   2005 	 */
   2006 	if (sorted == B_TRUE)
   2007 		return;
   2008 
   2009 	/*
   2010 	 * Just use bubble sort since number of nodes is small
   2011 	 */
   2012 	n = node_cnt;
   2013 	do {
   2014 		swapped = B_FALSE;
   2015 		n--;
   2016 		for (i = 0; i < n; i++) {
   2017 			/*
   2018 			 * Skip entries that don't exist
   2019 			 */
   2020 			if (!node_memory[i].exists)
   2021 				continue;
   2022 
   2023 			/*
   2024 			 * Try to find next existing entry to compare against
   2025 			 */
   2026 			found = B_FALSE;
   2027 			for (j = i + 1; j <= n; j++) {
   2028 				if (node_memory[j].exists) {
   2029 					found = B_TRUE;
   2030 					break;
   2031 				}
   2032 			}
   2033 
   2034 			/*
   2035 			 * Done if no more existing entries to compare against
   2036 			 */
   2037 			if (found == B_FALSE)
   2038 				break;
   2039 
   2040 			if (node_memory[i].start > node_memory[j].start) {
   2041 				node_phys_addr_map_t	save_addr;
   2042 				node_domain_map_t	save_node;
   2043 
   2044 				/*
   2045 				 * Swap node to proxmity domain ID assignments
   2046 				 */
   2047 				bcopy(&node_domain[i], &save_node,
   2048 				    sizeof (node_domain_map_t));
   2049 				bcopy(&node_domain[j], &node_domain[i],
   2050 				    sizeof (node_domain_map_t));
   2051 				bcopy(&save_node, &node_domain[j],
   2052 				    sizeof (node_domain_map_t));
   2053 
   2054 				/*
   2055 				 * Swap node to physical memory assignments
   2056 				 */
   2057 				bcopy(&node_memory[i], &save_addr,
   2058 				    sizeof (node_phys_addr_map_t));
   2059 				bcopy(&node_memory[j], &node_memory[i],
   2060 				    sizeof (node_phys_addr_map_t));
   2061 				bcopy(&save_addr, &node_memory[j],
   2062 				    sizeof (node_phys_addr_map_t));
   2063 				swapped = B_TRUE;
   2064 			}
   2065 		}
   2066 	} while (swapped == B_TRUE);
   2067 
   2068 	/*
   2069 	 * Check to make sure that CPUs assigned to correct node IDs now since
   2070 	 * node to proximity domain ID assignments may have been changed above
   2071 	 */
   2072 	if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1)
   2073 		return;
   2074 	for (i = 0; i < cpu_count; i++) {
   2075 		int		node;
   2076 
   2077 		node = lgrp_plat_domain_to_node(node_domain, node_cnt,
   2078 		    cpu_node[i].prox_domain);
   2079 		if (cpu_node[i].node != node)
   2080 			cpu_node[i].node = node;
   2081 	}
   2082 
   2083 }
   2084 
   2085 
   2086 /*
   2087  * Return time needed to probe from current CPU to memory in given node
   2088  */
   2089 static hrtime_t
   2090 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
   2091     lgrp_plat_probe_mem_config_t *probe_mem_config,
   2092     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
   2093 {
   2094 	caddr_t			buf;
   2095 	hrtime_t		elapsed;
   2096 	hrtime_t		end;
   2097 	int			from;
   2098 	int			i;
   2099 	int			ipl;
   2100 	hrtime_t		max;
   2101 	hrtime_t		min;
   2102 	hrtime_t		start;
   2103 	extern int		use_sse_pagecopy;
   2104 
   2105 	/*
   2106 	 * Determine ID of node containing current CPU
   2107 	 */
   2108 	from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
   2109 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
   2110 
   2111 	/*
   2112 	 * Do common work for probing main memory
   2113 	 */
   2114 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
   2115 		/*
   2116 		 * Skip probing any nodes without memory and
   2117 		 * set probe time to 0
   2118 		 */
   2119 		if (probe_mem_config->probe_va[to] == NULL) {
   2120 			lat_stats->latencies[from][to] = 0;
   2121 			return (0);
   2122 		}
   2123 
   2124 		/*
   2125 		 * Invalidate caches once instead of once every sample
   2126 		 * which should cut cost of probing by a lot
   2127 		 */
   2128 		probe_stats->flush_cost = gethrtime();
   2129 		invalidate_cache();
   2130 		probe_stats->flush_cost = gethrtime() -
   2131 		    probe_stats->flush_cost;
   2132 		probe_stats->probe_cost_total += probe_stats->flush_cost;
   2133 	}
   2134 
   2135 	/*
   2136 	 * Probe from current CPU to given memory using specified operation
   2137 	 * and take specified number of samples
   2138 	 */
   2139 	max = 0;
   2140 	min = -1;
   2141 	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
   2142 		probe_stats->probe_cost = gethrtime();
   2143 
   2144 		/*
   2145 		 * Can't measure probe time if gethrtime() isn't working yet
   2146 		 */
   2147 		if (probe_stats->probe_cost == 0 && gethrtime() == 0)
   2148 			return (0);
   2149 
   2150 		if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
   2151 			/*
   2152 			 * Measure how long it takes to read vendor ID from
   2153 			 * Northbridge
   2154 			 */
   2155 			elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
   2156 		} else {
   2157 			/*
   2158 			 * Measure how long it takes to copy page
   2159 			 * on top of itself
   2160 			 */
   2161 			buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
   2162 
   2163 			kpreempt_disable();
   2164 			ipl = splhigh();
   2165 			start = gethrtime();
   2166 			if (use_sse_pagecopy)
   2167 				hwblkpagecopy(buf, buf);
   2168 			else
   2169 				bcopy(buf, buf, PAGESIZE);
   2170 			end = gethrtime();
   2171 			elapsed = end - start;
   2172 			splx(ipl);
   2173 			kpreempt_enable();
   2174 		}
   2175 
   2176 		probe_stats->probe_cost = gethrtime() -
   2177 		    probe_stats->probe_cost;
   2178 		probe_stats->probe_cost_total += probe_stats->probe_cost;
   2179 
   2180 		if (min == -1 || elapsed < min)
   2181 			min = elapsed;
   2182 		if (elapsed > max)
   2183 			max = elapsed;
   2184 	}
   2185 
   2186 	/*
   2187 	 * Update minimum and maximum probe times between
   2188 	 * these two nodes
   2189 	 */
   2190 	if (min < probe_stats->probe_min[from][to] ||
   2191 	    probe_stats->probe_min[from][to] == 0)
   2192 		probe_stats->probe_min[from][to] = min;
   2193 
   2194 	if (max > probe_stats->probe_max[from][to])
   2195 		probe_stats->probe_max[from][to] = max;
   2196 
   2197 	return (min);
   2198 }
   2199 
   2200 
   2201 /*
   2202  * Read boot property with CPU to APIC ID array, fill in CPU to node ID
   2203  * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
   2204  * and return number of CPU APIC IDs.
   2205  *
   2206  * NOTE: This code assumes that CPU IDs are assigned in order that they appear
   2207  *       in in cpu_apicid_array boot property which is based on and follows
   2208  *	 same ordering as processor list in ACPI MADT.  If the code in
   2209  *	 usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns
   2210  *	 CPU IDs ever changes, then this code will need to change too....
   2211  */
   2212 static int
   2213 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
   2214 {
   2215 	int	boot_prop_len;
   2216 	char	*boot_prop_name = BP_CPU_APICID_ARRAY;
   2217 	uint8_t	cpu_apicid_array[UINT8_MAX + 1];
   2218 	int	i;
   2219 	int	n;
   2220 
   2221 	/*
   2222 	 * Check length of property value
   2223 	 */
   2224 	boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
   2225 	if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array))
   2226 		return (-1);
   2227 
   2228 	/*
   2229 	 * Calculate number of entries in array and return when there's just
   2230 	 * one CPU since that's not very interesting for NUMA
   2231 	 */
   2232 	n = boot_prop_len / sizeof (uint8_t);
   2233 	if (n == 1)
   2234 		return (-2);
   2235 
   2236 	/*
   2237 	 * Get CPU to APIC ID property value
   2238 	 */
   2239 	if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
   2240 		return (-3);
   2241 
   2242 	/*
   2243 	 * Just return number of CPU APIC IDs if CPU to node mapping table is
   2244 	 * NULL
   2245 	 */
   2246 	if (cpu_node == NULL)
   2247 		return (n);
   2248 
   2249 	/*
   2250 	 * Fill in CPU to node ID mapping table with APIC ID for each CPU
   2251 	 */
   2252 	for (i = 0; i < n; i++) {
   2253 		cpu_node[i].exists = 1;
   2254 		cpu_node[i].apicid = cpu_apicid_array[i];
   2255 	}
   2256 
   2257 	/*
   2258 	 * Return number of CPUs based on number of APIC IDs
   2259 	 */
   2260 	return (n);
   2261 }
   2262 
   2263 
   2264 /*
   2265  * Read ACPI System Locality Information Table (SLIT) to determine how far each
   2266  * NUMA node is from each other
   2267  */
   2268 static int
   2269 lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
   2270     node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats)
   2271 {
   2272 	int		i;
   2273 	int		j;
   2274 	int		localities;
   2275 	hrtime_t	max;
   2276 	hrtime_t	min;
   2277 	int		retval;
   2278 	uint8_t		*slit_entries;
   2279 
   2280 	if (tp == NULL || !lgrp_plat_slit_enable)
   2281 		return (1);
   2282 
   2283 	if (lat_stats == NULL)
   2284 		return (2);
   2285 
   2286 	localities = tp->number;
   2287 	if (localities != node_cnt)
   2288 		return (3);
   2289 
   2290 	min = lat_stats->latency_min;
   2291 	max = lat_stats->latency_max;
   2292 
   2293 	/*
   2294 	 * Fill in latency matrix based on SLIT entries
   2295 	 */
   2296 	slit_entries = tp->entry;
   2297 	for (i = 0; i < localities; i++) {
   2298 		for (j = 0; j < localities; j++) {
   2299 			uint8_t	latency;
   2300 
   2301 			latency = slit_entries[(i * localities) + j];
   2302 			lat_stats->latencies[i][j] = latency;
   2303 			if (latency < min || min == -1)
   2304 				min = latency;
   2305 			if (latency > max)
   2306 				max = latency;
   2307 		}
   2308 	}
   2309 
   2310 	/*
   2311 	 * Verify that latencies/distances given in SLIT look reasonable
   2312 	 */
   2313 	retval = lgrp_plat_latency_verify(node_memory, lat_stats);
   2314 
   2315 	if (retval) {
   2316 		/*
   2317 		 * Reinitialize (zero) latency table since SLIT doesn't look
   2318 		 * right
   2319 		 */
   2320 		for (i = 0; i < localities; i++) {
   2321 			for (j = 0; j < localities; j++)
   2322 				lat_stats->latencies[i][j] = 0;
   2323 		}
   2324 	} else {
   2325 		/*
   2326 		 * Update min and max latencies seen since SLIT looks valid
   2327 		 */
   2328 		lat_stats->latency_min = min;
   2329 		lat_stats->latency_max = max;
   2330 	}
   2331 
   2332 	return (retval);
   2333 }
   2334 
   2335 
   2336 /*
   2337  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
   2338  * and memory are local to each other in the same NUMA node and return number
   2339  * of nodes
   2340  */
   2341 static int
   2342 lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min,
   2343     node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, int cpu_count,
   2344     node_phys_addr_map_t *node_memory)
   2345 {
   2346 	struct srat_item	*srat_end;
   2347 	int			i;
   2348 	struct srat_item	*item;
   2349 	int			node_cnt;
   2350 	int			proc_entry_count;
   2351 
   2352 	/*
   2353 	 * Nothing to do when no SRAT or disabled
   2354 	 */
   2355 	if (tp == NULL || !lgrp_plat_srat_enable)
   2356 		return (-1);
   2357 
   2358 	/*
   2359 	 * Determine number of nodes by counting number of proximity domains in
   2360 	 * SRAT and return if number of nodes is 1 or less since don't need to
   2361 	 * read SRAT then
   2362 	 */
   2363 	node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
   2364 	if (node_cnt == 1)
   2365 		return (1);
   2366 	else if (node_cnt <= 0)
   2367 		return (-2);
   2368 
   2369 	/*
   2370 	 * Walk through SRAT, examining each CPU and memory entry to determine
   2371 	 * which CPUs and memory belong to which node.
   2372 	 */
   2373 	item = tp->list;
   2374 	srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
   2375 	proc_entry_count = 0;
   2376 	while (item < srat_end) {
   2377 		uint32_t	apic_id;
   2378 		uint32_t	domain;
   2379 		uint64_t	end;
   2380 		uint64_t	length;
   2381 		uint64_t	start;
   2382 
   2383 		switch (item->type) {
   2384 		case SRAT_PROCESSOR:	/* CPU entry */
   2385 			if (!(item->i.p.flags & SRAT_ENABLED) ||
   2386 			    cpu_node == NULL)
   2387 				break;
   2388 
   2389 			/*
   2390 			 * Calculate domain (node) ID and fill in APIC ID to
   2391 			 * domain/node mapping table
   2392 			 */
   2393 			domain = item->i.p.domain1;
   2394 			for (i = 0; i < 3; i++) {
   2395 				domain += item->i.p.domain2[i] <<
   2396 				    ((i + 1) * 8);
   2397 			}
   2398 			apic_id = item->i.p.apic_id;
   2399 
   2400 			if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
   2401 			    cpu_node, cpu_count, apic_id, domain) < 0)
   2402 				return (-3);
   2403 
   2404 			proc_entry_count++;
   2405 			break;
   2406 
   2407 		case SRAT_MEMORY:	/* memory entry */
   2408 			if (!(item->i.m.flags & SRAT_ENABLED) ||
   2409 			    node_memory == NULL)
   2410 				break;
   2411 
   2412 			/*
   2413 			 * Get domain (node) ID and fill in domain/node
   2414 			 * to memory mapping table
   2415 			 */
   2416 			domain = item->i.m.domain;
   2417 			start = item->i.m.base_addr;
   2418 			length = item->i.m.len;
   2419 			end = start + length - 1;
   2420 
   2421 			if (lgrp_plat_node_memory_update(node_domain, node_cnt,
   2422 			    node_memory, start, end, domain) < 0)
   2423 				return (-4);
   2424 			break;
   2425 		case SRAT_X2APIC:	/* x2apic CPU entry */
   2426 			if (!(item->i.xp.flags & SRAT_ENABLED) ||
   2427 			    cpu_node == NULL)
   2428 				break;
   2429 
   2430 			/*
   2431 			 * Calculate domain (node) ID and fill in APIC ID to
   2432 			 * domain/node mapping table
   2433 			 */
   2434 			domain = item->i.xp.domain;
   2435 			apic_id = item->i.xp.x2apic_id;
   2436 
   2437 			if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
   2438 			    cpu_node, cpu_count, apic_id, domain) < 0)
   2439 				return (-3);
   2440 
   2441 			proc_entry_count++;
   2442 			break;
   2443 
   2444 		default:
   2445 			break;
   2446 		}
   2447 
   2448 		item = (struct srat_item *)((uintptr_t)item + item->len);
   2449 	}
   2450 
   2451 	/*
   2452 	 * Should have seen at least as many SRAT processor entries as CPUs
   2453 	 */
   2454 	if (proc_entry_count < cpu_count)
   2455 		return (-5);
   2456 
   2457 	/*
   2458 	 * Need to sort nodes by starting physical address since VM system
   2459 	 * assumes and expects memnodes to be sorted in ascending order by
   2460 	 * physical address
   2461 	 */
   2462 	lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
   2463 	    node_memory);
   2464 
   2465 	return (node_cnt);
   2466 }
   2467 
   2468 
   2469 /*
   2470  * Allocate permanent memory for any temporary memory that we needed to
   2471  * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
   2472  * initialized and copy everything from temporary to permanent memory since
   2473  * temporary boot memory will eventually be released during boot
   2474  */
   2475 static void
   2476 lgrp_plat_release_bootstrap(void)
   2477 {
   2478 	void	*buf;
   2479 	size_t	size;
   2480 
   2481 	if (lgrp_plat_cpu_node_nentries > 0) {
   2482 		size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
   2483 		buf = kmem_alloc(size, KM_SLEEP);
   2484 		bcopy(lgrp_plat_cpu_node, buf, size);
   2485 		lgrp_plat_cpu_node = buf;
   2486 	}
   2487 }
   2488 
   2489 
   2490 /*
   2491  * Return number of proximity domains given in ACPI SRAT
   2492  */
   2493 static int
   2494 lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min)
   2495 {
   2496 	int			domain_cnt;
   2497 	uint32_t		domain_min;
   2498 	struct srat_item	*end;
   2499 	int			i;
   2500 	struct srat_item	*item;
   2501 	node_domain_map_t	node_domain[MAX_NODES];
   2502 
   2503 
   2504 	if (tp == NULL || !lgrp_plat_srat_enable)
   2505 		return (1);
   2506 
   2507 	/*
   2508 	 * Walk through SRAT to find minimum proximity domain ID
   2509 	 */
   2510 	domain_min = UINT32_MAX;
   2511 	item = tp->list;
   2512 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
   2513 	while (item < end) {
   2514 		uint32_t	domain;
   2515 
   2516 		switch (item->type) {
   2517 		case SRAT_PROCESSOR:	/* CPU entry */
   2518 			if (!(item->i.p.flags & SRAT_ENABLED)) {
   2519 				item = (struct srat_item *)((uintptr_t)item +
   2520 				    item->len);
   2521 				continue;
   2522 			}
   2523 			domain = item->i.p.domain1;
   2524 			for (i = 0; i < 3; i++) {
   2525 				domain += item->i.p.domain2[i] <<
   2526 				    ((i + 1) * 8);
   2527 			}
   2528 			break;
   2529 
   2530 		case SRAT_MEMORY:	/* memory entry */
   2531 			if (!(item->i.m.flags & SRAT_ENABLED)) {
   2532 				item = (struct srat_item *)((uintptr_t)item +
   2533 				    item->len);
   2534 				continue;
   2535 			}
   2536 			domain = item->i.m.domain;
   2537 			break;
   2538 
   2539 		case SRAT_X2APIC:	/* x2apic CPU entry */
   2540 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
   2541 				item = (struct srat_item *)((uintptr_t)item +
   2542 				    item->len);
   2543 				continue;
   2544 			}
   2545 			domain = item->i.xp.domain;
   2546 			break;
   2547 
   2548 		default:
   2549 			item = (struct srat_item *)((uintptr_t)item +
   2550 			    item->len);
   2551 			continue;
   2552 		}
   2553 
   2554 		/*
   2555 		 * Keep track of minimum proximity domain ID
   2556 		 */
   2557 		if (domain < domain_min)
   2558 			domain_min = domain;
   2559 
   2560 		item = (struct srat_item *)((uintptr_t)item + item->len);
   2561 	}
   2562 	if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
   2563 		*prox_domain_min = domain_min;
   2564 
   2565 	/*
   2566 	 * Walk through SRAT, examining each CPU and memory entry to determine
   2567 	 * proximity domain ID for each.
   2568 	 */
   2569 	domain_cnt = 0;
   2570 	item = tp->list;
   2571 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
   2572 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
   2573 	while (item < end) {
   2574 		uint32_t	domain;
   2575 		boolean_t	overflow;
   2576 		uint_t		start;
   2577 
   2578 		switch (item->type) {
   2579 		case SRAT_PROCESSOR:	/* CPU entry */
   2580 			if (!(item->i.p.flags & SRAT_ENABLED)) {
   2581 				item = (struct srat_item *)((uintptr_t)item +
   2582 				    item->len);
   2583 				continue;
   2584 			}
   2585 			domain = item->i.p.domain1;
   2586 			for (i = 0; i < 3; i++) {
   2587 				domain += item->i.p.domain2[i] <<
   2588 				    ((i + 1) * 8);
   2589 			}
   2590 			break;
   2591 
   2592 		case SRAT_MEMORY:	/* memory entry */
   2593 			if (!(item->i.m.flags & SRAT_ENABLED)) {
   2594 				item = (struct srat_item *)((uintptr_t)item +
   2595 				    item->len);
   2596 				continue;
   2597 			}
   2598 			domain = item->i.m.domain;
   2599 			break;
   2600 
   2601 		case SRAT_X2APIC:	/* x2apic CPU entry */
   2602 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
   2603 				item = (struct srat_item *)((uintptr_t)item +
   2604 				    item->len);
   2605 				continue;
   2606 			}
   2607 			domain = item->i.xp.domain;
   2608 			break;
   2609 
   2610 		default:
   2611 			item = (struct srat_item *)((uintptr_t)item +
   2612 			    item->len);
   2613 			continue;
   2614 		}
   2615 
   2616 		/*
   2617 		 * Count and keep track of which proximity domain IDs seen
   2618 		 */
   2619 		start = i = domain % MAX_NODES;
   2620 		overflow = B_TRUE;
   2621 		do {
   2622 			/*
   2623 			 * Create entry for proximity domain and increment
   2624 			 * count when no entry exists where proximity domain
   2625 			 * hashed
   2626 			 */
   2627 			if (!node_domain[i].exists) {
   2628 				node_domain[i].exists = 1;
   2629 				node_domain[i].prox_domain = domain;
   2630 				domain_cnt++;
   2631 				overflow = B_FALSE;
   2632 				break;
   2633 			}
   2634 
   2635 			/*
   2636 			 * Nothing to do when proximity domain seen already
   2637 			 * and its entry exists
   2638 			 */
   2639 			if (node_domain[i].prox_domain == domain) {
   2640 				overflow = B_FALSE;
   2641 				break;
   2642 			}
   2643 
   2644 			/*
   2645 			 * Entry exists where proximity domain hashed, but for
   2646 			 * different proximity domain so keep search for empty
   2647 			 * slot to put it or matching entry whichever comes
   2648 			 * first.
   2649 			 */
   2650 			i = (i + 1) % MAX_NODES;
   2651 		} while (i != start);
   2652 
   2653 		/*
   2654 		 * Didn't find empty or matching entry which means have more
   2655 		 * proximity domains than supported nodes (:-(
   2656 		 */
   2657 		ASSERT(overflow != B_TRUE);
   2658 		if (overflow == B_TRUE)
   2659 			return (-1);
   2660 
   2661 		item = (struct srat_item *)((uintptr_t)item + item->len);
   2662 	}
   2663 	return (domain_cnt);
   2664 }
   2665 
   2666 
   2667 /*
   2668  * Set lgroup latencies for 2 level lgroup topology
   2669  */
   2670 static void
   2671 lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
   2672     lgrp_plat_latency_stats_t *lat_stats)
   2673 {
   2674 	int	i;
   2675 
   2676 	ASSERT(node_memory != NULL && lat_stats != NULL);
   2677 
   2678 	if (lgrp_plat_node_cnt >= 4)
   2679 		cmn_err(CE_NOTE,
   2680 		    "MPO only optimizing for local and remote\n");
   2681 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   2682 		int	j;
   2683 
   2684 		if (!node_memory[i].exists)
   2685 			continue;
   2686 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
   2687 			if (!node_memory[j].exists)
   2688 				continue;
   2689 			if (i == j)
   2690 				lat_stats->latencies[i][j] = 2;
   2691 			else
   2692 				lat_stats->latencies[i][j] = 3;
   2693 		}
   2694 	}
   2695 	lat_stats->latency_min = 2;
   2696 	lat_stats->latency_max = 3;
   2697 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
   2698 }
   2699 
   2700 
   2701 /*
   2702  * The following Opteron specific constants, macros, types, and routines define
   2703  * PCI configuration space registers and how to read them to determine the NUMA
   2704  * configuration of *supported* Opteron processors.  They provide the same
   2705  * information that may be gotten from the ACPI System Resource Affinity Table
   2706  * (SRAT) if it exists on the machine of interest.
   2707  *
   2708  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
   2709  * of interest describes all of these registers and their contents.  The main
   2710  * registers used by this code to determine the NUMA configuration of the
   2711  * machine are the node ID register for the number of NUMA nodes and the DRAM
   2712  * address map registers for the physical address range of each node.
   2713  *
   2714  * NOTE: The format and how to determine the NUMA configuration using PCI
   2715  *	 config space registers may change or may not be supported in future
   2716  *	 Opteron processor families.
   2717  */
   2718 
   2719 /*
   2720  * How many bits to shift Opteron DRAM Address Map base and limit registers
   2721  * to get actual value
   2722  */
   2723 #define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
   2724 #define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */
   2725 
   2726 #define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
   2727 #define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */
   2728 
   2729 #define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */
   2730 
   2731 /*
   2732  * Macros to derive addresses from Opteron DRAM Address Map registers
   2733  */
   2734 #define	OPT_DRAMADDR_HI(reg) \
   2735 	(((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \
   2736 	    OPT_DRAMADDR_HI_LSHIFT_ADDR)
   2737 
   2738 #define	OPT_DRAMADDR_LO(reg) \
   2739 	(((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \
   2740 	    OPT_DRAMADDR_LO_LSHIFT_ADDR)
   2741 
   2742 #define	OPT_DRAMADDR(high, low) \
   2743 	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
   2744 
   2745 /*
   2746  * Bit masks defining what's in Opteron DRAM Address Map base register
   2747  */
   2748 #define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
   2749 #define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
   2750 #define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */
   2751 
   2752 /*
   2753  * Bit masks defining what's in Opteron DRAM Address Map limit register
   2754  */
   2755 #define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7		/* destination node */
   2756 #define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700		/* interleave select */
   2757 
   2758 
   2759 /*
   2760  * Opteron Node ID register in PCI configuration space contains
   2761  * number of nodes in system, etc. for Opteron K8.  The following
   2762  * constants and macros define its contents, structure, and access.
   2763  */
   2764 
   2765 /*
   2766  * Bit masks defining what's in Opteron Node ID register
   2767  */
   2768 #define	OPT_NODE_MASK_ID	0x7	/* node ID */
   2769 #define	OPT_NODE_MASK_CNT	0x70	/* node count */
   2770 #define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
   2771 #define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
   2772 #define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU)  */
   2773 
   2774 /*
   2775  * How many bits in Opteron Node ID register to shift right to get actual value
   2776  */
   2777 #define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */
   2778 
   2779 /*
   2780  * Macros to get values from Opteron Node ID register
   2781  */
   2782 #define	OPT_NODE_CNT(reg) \
   2783 	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
   2784 
   2785 /*
   2786  * Macro to setup PCI Extended Configuration Space (ECS) address to give to
   2787  * "in/out" instructions
   2788  *
   2789  * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
   2790  *	 other uses should just do MMIO to access PCI ECS.
   2791  *	 Must enable special bit in Northbridge Configuration Register on
   2792  *	 Greyhound for extended CF8 space access to be able to access PCI ECS
   2793  *	 using "in/out" instructions and restore special bit after done
   2794  *	 accessing PCI ECS.
   2795  */
   2796 #define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
   2797 	(PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11)  | \
   2798 	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
   2799 	    ((((reg) >> 8) & 0xf) << 24))
   2800 
   2801 /*
   2802  * PCI configuration space registers accessed by specifying
   2803  * a bus, device, function, and offset.  The following constants
   2804  * define the values needed to access Opteron K8 configuration
   2805  * info to determine its node topology
   2806  */
   2807 
   2808 #define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */
   2809 
   2810 /*
   2811  * Opteron PCI configuration space register function values
   2812  */
   2813 #define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
   2814 #define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
   2815 #define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
   2816 #define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */
   2817 
   2818 /*
   2819  * PCI Configuration Space register offsets
   2820  */
   2821 #define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
   2822 #define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
   2823 #define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
   2824 #define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */
   2825 
   2826 /*
   2827  * Opteron PCI Configuration Space device IDs for nodes
   2828  */
   2829 #define	OPT_PCS_DEV_NODE0		24	/* device number for node 0 */
   2830 
   2831 
   2832 /*
   2833  * Opteron DRAM address map gives base and limit for physical memory in a node
   2834  */
   2835 typedef	struct opt_dram_addr_map {
   2836 	uint32_t	base_hi;
   2837 	uint32_t	base_lo;
   2838 	uint32_t	limit_hi;
   2839 	uint32_t	limit_lo;
   2840 } opt_dram_addr_map_t;
   2841 
   2842 
   2843 /*
   2844  * Supported AMD processor families
   2845  */
   2846 #define	AMD_FAMILY_HAMMER	15
   2847 #define	AMD_FAMILY_GREYHOUND	16
   2848 
/*
 * Whether to have is_opteron() return 1 even when processor isn't supported
 *
 * When nonzero, is_opteron() accepts an AMD processor of any family rather
 * than only AMD_FAMILY_HAMMER and AMD_FAMILY_GREYHOUND.  It does not bypass
 * the vendor check: is_opteron() still returns 0 for a non-AMD processor.
 */
uint_t	is_opteron_override = 0;

/*
 * AMD processor family for current CPU
 *
 * Initially 0.  is_opteron() stores the result of cpuid_getfamily() here once
 * it has established that the vendor is AMD.  opt_get_numa_config() compares
 * it against AMD_FAMILY_GREYHOUND to decide whether the high DRAM address map
 * registers need to be read.
 */
uint_t	opt_family = 0;
   2858 
   2859 
   2860 /*
   2861  * Determine whether we're running on a supported AMD Opteron since reading
   2862  * node count and DRAM address map registers may have different format or
   2863  * may not be supported across processor families
   2864  */
   2865 static int
   2866 is_opteron(void)
   2867 {
   2868 
   2869 	if (x86_vendor != X86_VENDOR_AMD)
   2870 		return (0);
   2871 
   2872 	opt_family = cpuid_getfamily(CPU);
   2873 	if (opt_family == AMD_FAMILY_HAMMER ||
   2874 	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
   2875 		return (1);
   2876 	else
   2877 		return (0);
   2878 }
   2879 
   2880 
   2881 /*
   2882  * Determine NUMA configuration for Opteron from registers that live in PCI
   2883  * configuration space
   2884  */
   2885 static void
   2886 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
   2887     node_phys_addr_map_t *node_memory)
   2888 {
   2889 	uint_t				bus;
   2890 	uint_t				dev;
   2891 	struct opt_dram_addr_map	dram_map[MAX_NODES];
   2892 	uint_t				node;
   2893 	uint_t				node_info[MAX_NODES];
   2894 	uint_t				off_hi;
   2895 	uint_t				off_lo;
   2896 	uint64_t			nb_cfg_reg;
   2897 
   2898 	/*
   2899 	 * Read configuration registers from PCI configuration space to
   2900 	 * determine node information, which memory is in each node, etc.
   2901 	 *
   2902 	 * Write to PCI configuration space address register to specify
   2903 	 * which configuration register to read and read/write PCI
   2904 	 * configuration space data register to get/set contents
   2905 	 */
   2906 	bus = OPT_PCS_BUS_CONFIG;
   2907 	dev = OPT_PCS_DEV_NODE0;
   2908 	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
   2909 	off_lo = OPT_PCS_OFF_DRAMBASE_LO;
   2910 
   2911 	/*
   2912 	 * Read node ID register for node 0 to get node count
   2913 	 */
   2914 	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
   2915 	    OPT_PCS_OFF_NODEID);
   2916 	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
   2917 
   2918 	/*
   2919 	 * If number of nodes is more than maximum supported, then set node
   2920 	 * count to 1 and treat system as UMA instead of NUMA.
   2921 	 */
   2922 	if (*node_cnt > MAX_NODES) {
   2923 		*node_cnt = 1;
   2924 		return;
   2925 	}
   2926 
   2927 	/*
   2928 	 * For Greyhound, PCI Extended Configuration Space must be enabled to
   2929 	 * read high DRAM address map base and limit registers
   2930 	 */
   2931 	if (opt_family == AMD_FAMILY_GREYHOUND) {
   2932 		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
   2933 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
   2934 			wrmsr(MSR_AMD_NB_CFG,
   2935 			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
   2936 	}
   2937 
   2938 	for (node = 0; node < *node_cnt; node++) {
   2939 		uint32_t	base_hi;
   2940 		uint32_t	base_lo;
   2941 		uint32_t	limit_hi;
   2942 		uint32_t	limit_lo;
   2943 
   2944 		/*
   2945 		 * Read node ID register (except for node 0 which we just read)
   2946 		 */
   2947 		if (node > 0) {
   2948 			node_info[node] = pci_getl_func(bus, dev,
   2949 			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
   2950 		}
   2951 
   2952 		/*
   2953 		 * Read DRAM base and limit registers which specify
   2954 		 * physical memory range of each node
   2955 		 */
   2956 		if (opt_family != AMD_FAMILY_GREYHOUND)
   2957 			base_hi = 0;
   2958 		else {
   2959 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
   2960 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
   2961 			base_hi = dram_map[node].base_hi =
   2962 			    inl(PCI_CONFDATA);
   2963 		}
   2964 		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
   2965 		    OPT_PCS_FUNC_ADDRMAP, off_lo);
   2966 
   2967 		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
   2968 		    mem_intrlv)
   2969 			*mem_intrlv = *mem_intrlv + 1;
   2970 
   2971 		off_hi += 4;	/* high limit register offset */
   2972 		if (opt_family != AMD_FAMILY_GREYHOUND)
   2973 			limit_hi = 0;
   2974 		else {
   2975 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
   2976 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
   2977 			limit_hi = dram_map[node].limit_hi =
   2978 			    inl(PCI_CONFDATA);
   2979 		}
   2980 
   2981 		off_lo += 4;	/* low limit register offset */
   2982 		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
   2983 		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);
   2984 
   2985 		/*
   2986 		 * Increment device number to next node and register offsets
   2987 		 * for DRAM base register of next node
   2988 		 */
   2989 		off_hi += 4;
   2990 		off_lo += 4;
   2991 		dev++;
   2992 
   2993 		/*
   2994 		 * Both read and write enable bits must be enabled in DRAM
   2995 		 * address map base register for physical memory to exist in
   2996 		 * node
   2997 		 */
   2998 		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
   2999 		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
   3000 			/*
   3001 			 * Mark node memory as non-existent and set start and
   3002 			 * end addresses to be same in node_memory[]
   3003 			 */
   3004 			node_memory[node].exists = 0;
   3005 			node_memory[node].start = node_memory[node].end =
   3006 			    (pfn_t)-1;
   3007 			continue;
   3008 		}
   3009 
   3010 		/*
   3011 		 * Mark node memory as existing and remember physical address
   3012 		 * range of each node for use later
   3013 		 */
   3014 		node_memory[node].exists = 1;
   3015 
   3016 		node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));
   3017 
   3018 		node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
   3019 		    OPT_DRAMADDR_LO_MASK_OFF);
   3020 	}
   3021 
   3022 	/*
   3023 	 * Restore PCI Extended Configuration Space enable bit
   3024 	 */
   3025 	if (opt_family == AMD_FAMILY_GREYHOUND) {
   3026 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
   3027 			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
   3028 	}
   3029 }
   3030 
   3031 
   3032 /*
   3033  * Return average amount of time to read vendor ID register on Northbridge
   3034  * N times on specified destination node from current CPU
   3035  */
   3036 static hrtime_t
   3037 opt_probe_vendor(int dest_node, int nreads)
   3038 {
   3039 	int		cnt;
   3040 	uint_t		dev;
   3041 	/* LINTED: set but not used in function */
   3042 	volatile uint_t	dev_vendor;
   3043 	hrtime_t	elapsed;
   3044 	hrtime_t	end;
   3045 	int		ipl;
   3046 	hrtime_t	start;
   3047 
   3048 	dev = OPT_PCS_DEV_NODE0 + dest_node;
   3049 	kpreempt_disable();
   3050 	ipl = spl8();
   3051 	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
   3052 	    OPT_PCS_OFF_VENDOR));
   3053 	start = gethrtime();
   3054 	for (cnt = 0; cnt < nreads; cnt++)
   3055 		dev_vendor = inl(PCI_CONFDATA);
   3056 	end = gethrtime();
   3057 	elapsed = (end - start) / nreads;
   3058 	splx(ipl);
   3059 	kpreempt_enable();
   3060 	return (elapsed);
   3061 }
   3062