Home | History | Annotate | Download | only in os
      1      0    stevel /*
      2      0    stevel  * CDDL HEADER START
      3      0    stevel  *
      4      0    stevel  * The contents of this file are subject to the terms of the
      5   2220    stevel  * Common Development and Distribution License (the "License").
      6   2220    stevel  * You may not use this file except in compliance with the License.
      7      0    stevel  *
      8      0    stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0    stevel  * or http://www.opensolaris.org/os/licensing.
     10      0    stevel  * See the License for the specific language governing permissions
     11      0    stevel  * and limitations under the License.
     12      0    stevel  *
     13      0    stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0    stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0    stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0    stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0    stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0    stevel  *
     19      0    stevel  * CDDL HEADER END
     20      0    stevel  */
     21   2220    stevel 
     22      0    stevel /*
     23   9053  jonathan  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24      0    stevel  * Use is subject to license terms.
     25      0    stevel  */
     26      0    stevel 
     27   6445       jjc /*
     28   6445       jjc  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
     29   6445       jjc  * ================================================================
     30   6445       jjc  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
     31   6445       jjc  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
     32   6445       jjc  * one or more CPUs and some local memory.  The CPUs in each node can access
     33   6445       jjc  * the memory in the other nodes but at a higher latency than accessing their
     34   6445       jjc  * local memory.  Typically, a system with only one node has Uniform Memory
     35   6445       jjc  * Access (UMA), but it may be possible to have a one node system that has
     36   6445       jjc  * some global memory outside of the node which is higher latency.
     37   6445       jjc  *
     38   6445       jjc  * Module Description
     39   6445       jjc  * ------------------
     40   6445       jjc  * This module provides a platform interface for determining which CPUs and
     41   6445       jjc  * which memory (and how much) are in a NUMA node and how far each node is from
     42   6445       jjc  * each other.  The interface is used by the Virtual Memory (VM) system and the
     43   6445       jjc  * common lgroup framework.  The VM system uses the plat_*() routines to fill
     44   6445       jjc  * in its memory node (memnode) array with the physical address range spanned
     45   6445       jjc  * by each NUMA node to know which memory belongs to which node, so it can
     46   6445       jjc  * build and manage a physical page free list for each NUMA node and allocate
     47   6445       jjc  * local memory from each node as needed.  The common lgroup framework uses the
     48   6445       jjc  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
     49   6445       jjc  * to each node (leaf lgroup) and how far each node is from each other, so it
     50   6445       jjc  * can build the latency (lgroup) topology for the machine in order to optimize
      51   6445       jjc  * for locality.  Also, lgroup platform handles instead of lgroups are used
     52   6445       jjc  * in the interface with this module, so this module shouldn't need to know
     53   6445       jjc  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
     54   6445       jjc  * etc. are in each NUMA node, how far each node is from each other, and to use
     55   6445       jjc  * a unique lgroup platform handle to refer to each node through the interface.
     56   6445       jjc  *
     57   6445       jjc  * Determining NUMA Configuration
     58   6445       jjc  * ------------------------------
     59   6445       jjc  * By default, this module will try to determine the NUMA configuration of the
     60   6445       jjc  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
     61   6445       jjc  * Locality Information Table (SLIT).  The SRAT contains info to tell which
     62   6445       jjc  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
     63   6445       jjc  * is a matrix that gives the distance between each system locality (which is
     64   6445       jjc  * a NUMA node and should correspond to proximity domains in the SRAT).  For
     65   6445       jjc  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
     66   6445       jjc  * specification.
     67   6445       jjc  *
     68   6445       jjc  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
     69   6445       jjc  * examine registers in PCI configuration space to determine how many nodes are
      70   6445       jjc  * in the system and which CPUs and memory are in each node.  Probing memory
      71   6445       jjc  * from each CPU could find this too, but is too costly to do while booting the kernel.
     72   6445       jjc  *
     73   6445       jjc  * NOTE: Using these PCI configuration space registers to determine this
     74   6445       jjc  *       locality info is not guaranteed to work or be compatible across all
     75   6445       jjc  *	 Opteron processor families.
     76   6445       jjc  *
     77   6445       jjc  * If the SLIT does not exist or look right, the kernel will probe to determine
     78   6445       jjc  * the distance between nodes as long as the NUMA CPU and memory configuration
     79   6445       jjc  * has been determined (see lgrp_plat_probe() for details).
     80   6445       jjc  *
     81   6445       jjc  * Data Structures
     82   6445       jjc  * ---------------
     83   6445       jjc  * The main data structures used by this code are the following:
     84   6445       jjc  *
     85   6671       jjc  * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
     86   6671       jjc  *					CPU ID (only used for SRAT)
     87   6445       jjc  *
     88   6445       jjc  * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
     89   6445       jjc  *					different nodes indexed by node ID
     90   6445       jjc  *
     91   6445       jjc  * - lgrp_plat_node_cnt			Number of NUMA nodes in system
     92   6445       jjc  *
     93   6445       jjc  * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
     94   6445       jjc  *					table indexed by node ID (only used
     95   6445       jjc  *					for SRAT)
     96   6445       jjc  *
     97   6445       jjc  * - lgrp_plat_node_memory[]		Table with physical address range for
     98   6445       jjc  *					each node indexed by node ID
     99   6445       jjc  *
    100   6445       jjc  * The code is implemented to make the following always be true:
    101   6445       jjc  *
    102   6445       jjc  *	lgroup platform handle == node ID == memnode ID
    103   6445       jjc  *
    104   6445       jjc  * Moreover, it allows for the proximity domain ID to be equal to all of the
    105   6445       jjc  * above as long as the proximity domains IDs are numbered from 0 to <number of
    106   6445       jjc  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
    107   6445       jjc  * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
    108   6445       jjc  * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
    109   6445       jjc  * and be assigned node ID N.  If the proximity domain IDs aren't numbered
    110   6445       jjc  * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
    111   6445       jjc  * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
    112   6445       jjc  * to node IDs.  However, the proximity domain IDs may not map to the
    113   6445       jjc  * equivalent node ID since we want to keep the node IDs numbered from 0 to
    114   6445       jjc  * <number of nodes - 1> to minimize cost of searching and potentially space.
    115   9716  jonathan  *
    116   9716  jonathan  * The code below really tries to do the above.  However, the virtual memory
    117   9716  jonathan  * system expects the memnodes which describe the physical address range for
    118   9716  jonathan  * each NUMA node to be arranged in ascending order by physical address.  (:-(
    119   9716  jonathan  * Otherwise, the kernel will panic in different semi-random places in the VM
    120   9923  jonathan  * system.
    121   9716  jonathan  *
    122   9716  jonathan  * Consequently, this module has to try to sort the nodes in ascending order by
    123   9716  jonathan  * each node's starting physical address to try to meet this "constraint" in
    124   9716  jonathan  * the VM system (see lgrp_plat_node_sort()).  Also, the lowest numbered
     125   9716  jonathan  * proximity domain ID in the system is determined and used to make the lowest
    126   9716  jonathan  * numbered proximity domain map to node 0 in hopes that the proximity domains
    127   9716  jonathan  * are sorted in ascending order by physical address already even if their IDs
    128   9716  jonathan  * don't start at 0 (see NODE_DOMAIN_HASH() and lgrp_plat_srat_domains()).
    129   9716  jonathan  * Finally, it is important to note that these workarounds may not be
    130   9716  jonathan  * sufficient if/when memory hotplugging is supported and the VM system may
    131   9716  jonathan  * ultimately need to be fixed to handle this....
    132   6445       jjc  */
    133   6445       jjc 
    134   6445       jjc 
    135      0    stevel #include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
    136   6671       jjc #include <sys/bootconf.h>
    137      0    stevel #include <sys/cmn_err.h>
    138   4898       jjc #include <sys/controlregs.h>
    139      0    stevel #include <sys/cpupart.h>
    140      0    stevel #include <sys/cpuvar.h>
    141      0    stevel #include <sys/lgrp.h>
    142      0    stevel #include <sys/machsystm.h>
    143      0    stevel #include <sys/memlist.h>
    144      0    stevel #include <sys/memnode.h>
    145      0    stevel #include <sys/mman.h>
    146    938     esaxe #include <sys/pci_cfgspace.h>
    147    938     esaxe #include <sys/pci_impl.h>
    148      0    stevel #include <sys/param.h>
    149   3434     esaxe #include <sys/pghw.h>
    150      0    stevel #include <sys/promif.h>		/* for prom_printf() */
    151   6445       jjc #include <sys/sysmacros.h>
    152      0    stevel #include <sys/systm.h>
    153      0    stevel #include <sys/thread.h>
    154      0    stevel #include <sys/types.h>
    155      0    stevel #include <sys/var.h>
    156      0    stevel #include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
    157      0    stevel #include <vm/hat_i86.h>
    158      0    stevel #include <vm/seg_kmem.h>
    159    414     kchow #include <vm/vm_dep.h>
    160      0    stevel 
    161   6445       jjc #include "acpi_fw.h"		/* for SRAT and SLIT */
    162      0    stevel 
    163      0    stevel 
    164      0    stevel #define	MAX_NODES		8
    165      0    stevel #define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)
    166      0    stevel 
    167   6445       jjc /*
    168   6445       jjc  * Constants for configuring probing
    169   6445       jjc  */
    170      0    stevel #define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
    171      0    stevel #define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
    172   1228    andrei #define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */
    173      0    stevel 
    174      0    stevel /*
    175   6445       jjc  * Flags for probing
    176      0    stevel  */
    177   6445       jjc #define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
    178   6445       jjc #define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
    179   6445       jjc #define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */
    180      0    stevel 
    181      0    stevel /*
    182   9716  jonathan  * Hash proximity domain ID into node to domain mapping table "mod" number of
    183   9716  jonathan  * nodes to minimize span of entries used and try to have lowest numbered
    184   9716  jonathan  * proximity domain be node 0
    185      0    stevel  */
    186   9716  jonathan #define	NODE_DOMAIN_HASH(domain, node_cnt) \
    187   9716  jonathan 	((lgrp_plat_prox_domain_min == UINT32_MAX) ? (domain) % node_cnt : \
    188   9716  jonathan 	    ((domain) - lgrp_plat_prox_domain_min) % node_cnt)
    189      0    stevel 
    190   4898       jjc 
    191   4898       jjc /*
    192   6671       jjc  * CPU to node ID mapping structure (only used with SRAT)
    193   4898       jjc  */
    194   6445       jjc typedef	struct cpu_node_map {
    195   6445       jjc 	int		exists;
    196   6445       jjc 	uint_t		node;
    197   6445       jjc 	uint32_t	apicid;
    198   6445       jjc 	uint32_t	prox_domain;
    199   6445       jjc } cpu_node_map_t;
    200      0    stevel 
    201      0    stevel /*
    202   6445       jjc  * Latency statistics
    203      0    stevel  */
    204   6445       jjc typedef struct lgrp_plat_latency_stats {
    205   6445       jjc 	hrtime_t	latencies[MAX_NODES][MAX_NODES];
    206   6445       jjc 	hrtime_t	latency_max;
    207   6445       jjc 	hrtime_t	latency_min;
    208   6445       jjc } lgrp_plat_latency_stats_t;
    209      0    stevel 
    210      0    stevel /*
    211   6445       jjc  * Memory configuration for probing
    212      0    stevel  */
    213   6445       jjc typedef struct lgrp_plat_probe_mem_config {
    214   6445       jjc 	size_t	probe_memsize;		/* how much memory to probe per node */
    215   6445       jjc 	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
    216   6445       jjc 	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
    217   6445       jjc } lgrp_plat_probe_mem_config_t;
    218      0    stevel 
    219      0    stevel /*
    220   6445       jjc  * Statistics kept for probing
    221      0    stevel  */
    222   6445       jjc typedef struct lgrp_plat_probe_stats {
    223   6445       jjc 	hrtime_t	flush_cost;
    224   6445       jjc 	hrtime_t	probe_cost;
    225   6445       jjc 	hrtime_t	probe_cost_total;
    226   6445       jjc 	hrtime_t	probe_error_code;
    227   6445       jjc 	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
    228   6445       jjc 	int		probe_suspect[MAX_NODES][MAX_NODES];
    229   6445       jjc 	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
    230   6445       jjc 	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
    231   6445       jjc } lgrp_plat_probe_stats_t;
    232      0    stevel 
    233      0    stevel /*
    234   6445       jjc  * Node to proximity domain ID mapping structure (only used with SRAT)
    235      0    stevel  */
    236   6445       jjc typedef	struct node_domain_map {
    237   6445       jjc 	int		exists;
    238   6445       jjc 	uint32_t	prox_domain;
    239   6445       jjc } node_domain_map_t;
    240      0    stevel 
    241      0    stevel /*
    242   6445       jjc  * Node ID and starting and ending page for physical memory in node
    243      0    stevel  */
    244   6445       jjc typedef	struct node_phys_addr_map {
    245   6445       jjc 	pfn_t		start;
    246   6445       jjc 	pfn_t		end;
    247   6445       jjc 	int		exists;
    248   6445       jjc 	uint32_t	prox_domain;
    249   6445       jjc } node_phys_addr_map_t;
    250   6445       jjc 
    251   6671       jjc /*
    252   6706       jjc  * Number of CPUs for which we got APIC IDs
    253   6671       jjc  */
    254   6706       jjc static int				lgrp_plat_apic_ncpus = 0;
    255      0    stevel 
    256      0    stevel /*
    257  10710  jonathan  * CPU to node ID mapping table (only used for SRAT) and its max number of
    258  10710  jonathan  * entries
    259      0    stevel  */
    260  10710  jonathan static cpu_node_map_t			*lgrp_plat_cpu_node = NULL;
    261  10710  jonathan static uint_t				lgrp_plat_cpu_node_nentries = 0;
    262      0    stevel 
    263   4898       jjc /*
    264   6445       jjc  * Latency statistics
    265   4898       jjc  */
    266   6445       jjc lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;
    267      0    stevel 
    268      0    stevel /*
    269      0    stevel  * Whether memory is interleaved across nodes causing MPO to be disabled
    270      0    stevel  */
    271   6445       jjc static int				lgrp_plat_mem_intrlv = 0;
    272   6445       jjc 
    273   6445       jjc /*
    274   6445       jjc  * Node ID to proximity domain ID mapping table (only used for SRAT)
    275   6445       jjc  */
    276   6445       jjc static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];
    277   6445       jjc 
    278   6445       jjc /*
    279   6445       jjc  * Physical address range for memory in each node
    280   6445       jjc  */
    281   6445       jjc static node_phys_addr_map_t		lgrp_plat_node_memory[MAX_NODES];
    282   6445       jjc 
    283   6445       jjc /*
    284   6445       jjc  * Statistics gotten from probing
    285   6445       jjc  */
    286   6445       jjc static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;
    287   6445       jjc 
    288   6445       jjc /*
    289   6445       jjc  * Memory configuration for probing
    290   6445       jjc  */
    291   6445       jjc static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;
    292   6445       jjc 
    293   6445       jjc /*
    294   9716  jonathan  * Lowest proximity domain ID seen in ACPI SRAT
    295   9716  jonathan  */
    296   9716  jonathan static uint32_t				lgrp_plat_prox_domain_min = UINT32_MAX;
    297   9716  jonathan 
    298   9716  jonathan /*
    299   6445       jjc  * Error code from processing ACPI SRAT
    300   6445       jjc  */
    301   6445       jjc static int				lgrp_plat_srat_error = 0;
    302   6445       jjc 
    303   6445       jjc /*
    304   6445       jjc  * Error code from processing ACPI SLIT
    305   6445       jjc  */
    306   6445       jjc static int				lgrp_plat_slit_error = 0;
    307   6445       jjc 
    308   6445       jjc /*
    309   6445       jjc  * Allocate lgroup array statically
    310   6445       jjc  */
    311   6445       jjc static lgrp_t				lgrp_space[NLGRP];
    312   6445       jjc static int				nlgrps_alloc;
    313   6445       jjc 
    314      0    stevel 
    315      0    stevel /*
    316   9716  jonathan  * Enable finding and using minimum proximity domain ID when hashing
    317   9716  jonathan  */
    318   9716  jonathan int			lgrp_plat_domain_min_enable = 1;
    319   9716  jonathan 
    320   9716  jonathan /*
    321      0    stevel  * Number of nodes in system
    322      0    stevel  */
    323      0    stevel uint_t			lgrp_plat_node_cnt = 1;
    324   9716  jonathan 
    325   9716  jonathan /*
    326   9716  jonathan  * Enable sorting nodes in ascending order by starting physical address
    327   9716  jonathan  */
    328   9716  jonathan int			lgrp_plat_node_sort_enable = 1;
    329      0    stevel 
    330      0    stevel /*
    331   6445       jjc  * Configuration Parameters for Probing
    332   6445       jjc  * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
    333   6445       jjc  *				operation, etc.
    334   6445       jjc  * - lgrp_plat_probe_nrounds	How many rounds of probing to do
    335   6445       jjc  * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
    336   6445       jjc  *				node
    337   6445       jjc  * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
    338   6445       jjc  *				Northbridge for each probe
    339      0    stevel  */
    340   6445       jjc uint_t			lgrp_plat_probe_flags = 0;
    341      0    stevel int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
    342      0    stevel int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
    343   1228    andrei int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;
    344      0    stevel 
    345      0    stevel /*
    346   6445       jjc  * Enable use of ACPI System Resource Affinity Table (SRAT) and System
    347   6445       jjc  * Locality Information Table (SLIT)
    348      0    stevel  */
    349   6445       jjc int			lgrp_plat_srat_enable = 1;
    350   6445       jjc int			lgrp_plat_slit_enable = 1;
    351      0    stevel 
    352      0    stevel /*
    353   9732       Kit  * mnode_xwa: set to non-zero value to initiate workaround if large pages are
    354   9732       Kit  * found to be crossing memory node boundaries. The workaround will eliminate
    355   9732       Kit  * a base size page at the end of each memory node boundary to ensure that
    356   9732       Kit  * a large page with constituent pages that span more than 1 memory node
    357   9732       Kit  * can never be formed.
    358   9732       Kit  *
    359   9732       Kit  */
    360   9732       Kit int	mnode_xwa = 1;
    361   9732       Kit 
    362   9732       Kit /*
    363   6445       jjc  * Static array to hold lgroup statistics
    364      0    stevel  */
    365   6445       jjc struct lgrp_stats	lgrp_stats[NLGRP];
    366   6445       jjc 
    367      0    stevel 
    368      0    stevel /*
    369   6445       jjc  * Forward declarations of platform interface routines
    370      0    stevel  */
    371   6445       jjc void		plat_build_mem_nodes(struct memlist *list);
    372   6445       jjc 
    373   6445       jjc int		plat_lgrphand_to_mem_node(lgrp_handle_t hand);
    374   6445       jjc 
    375   6445       jjc lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);
    376   6445       jjc 
    377   6445       jjc int		plat_mnode_xcheck(pfn_t pfncnt);
    378   6445       jjc 
    379   6445       jjc int		plat_pfn_to_mem_node(pfn_t pfn);
    380      0    stevel 
    381      0    stevel /*
    382   6445       jjc  * Forward declarations of lgroup platform interface routines
    383      0    stevel  */
    384   6445       jjc lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);
    385   6445       jjc 
    386   6445       jjc void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
    387   6445       jjc 
    388   6445       jjc lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);
    389   6445       jjc 
    390  10710  jonathan void		lgrp_plat_init(lgrp_init_stages_t stage);
    391   6445       jjc 
    392   6445       jjc int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
    393   6445       jjc 
    394   6445       jjc int		lgrp_plat_max_lgrps(void);
    395   6445       jjc 
    396   6445       jjc pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
    397   6445       jjc     lgrp_mem_query_t query);
    398   6445       jjc 
    399   6445       jjc lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);
    400   6445       jjc 
    401   6445       jjc void		lgrp_plat_probe(void);
    402   6445       jjc 
    403   6445       jjc lgrp_handle_t	lgrp_plat_root_hand(void);
    404   6445       jjc 
    405      0    stevel 
    406      0    stevel /*
    407   6445       jjc  * Forward declarations of local routines
    408      0    stevel  */
    409   6445       jjc static int	is_opteron(void);
    410   6445       jjc 
    411   6671       jjc static int	lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
    412   6706       jjc     int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
    413   6706       jjc     uint32_t domain);
    414   6671       jjc 
    415  10710  jonathan static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
    416  10710  jonathan     int cpu_node_nentries);
    417   6445       jjc 
    418   6445       jjc static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
    419   6706       jjc     int node_cnt, uint32_t domain);
    420  10710  jonathan 
    421  10710  jonathan static void	lgrp_plat_get_numa_config(void);
    422   6445       jjc 
    423   6445       jjc static void	lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
    424   6445       jjc     lgrp_plat_latency_stats_t *lat_stats,
    425   6445       jjc     lgrp_plat_probe_stats_t *probe_stats);
    426   6445       jjc 
    427   6445       jjc static int	lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
    428   6445       jjc     lgrp_plat_latency_stats_t *lat_stats);
    429  10710  jonathan 
    430  10710  jonathan static void	lgrp_plat_main_init(void);
    431   6445       jjc 
    432   6445       jjc static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
    433   6445       jjc 
    434   6445       jjc static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
    435   6706       jjc     int node_cnt, uint32_t domain);
    436   6445       jjc 
    437   6445       jjc static int	lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
    438   6706       jjc     int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start,
    439   6706       jjc     uint64_t end, uint32_t domain);
    440   6445       jjc 
    441   9716  jonathan static void	lgrp_plat_node_sort(node_domain_map_t *node_domain,
    442   9716  jonathan     int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
    443   9716  jonathan     node_phys_addr_map_t *node_memory);
    444   9716  jonathan 
    445   6445       jjc static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
    446  10710  jonathan     int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
    447  10710  jonathan     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);
    448   6445       jjc 
    449   6706       jjc static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);
    450   6671       jjc 
    451   6445       jjc static int	lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
    452   6445       jjc     node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats);
    453   6445       jjc 
    454   6706       jjc static int	lgrp_plat_process_srat(struct srat *tp,
    455   9716  jonathan     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
    456   9716  jonathan     cpu_node_map_t *cpu_node, int cpu_count,
    457   6445       jjc     node_phys_addr_map_t *node_memory);
    458  10710  jonathan 
    459  10710  jonathan static void	lgrp_plat_release_bootstrap(void);
    460   6445       jjc 
    461   9716  jonathan static int	lgrp_plat_srat_domains(struct srat *tp,
    462   9716  jonathan     uint32_t *prox_domain_min);
    463   6445       jjc 
    464   6445       jjc static void	lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
    465   6445       jjc     lgrp_plat_latency_stats_t *lat_stats);
    466   6445       jjc 
    467   6445       jjc static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    468   6445       jjc     node_phys_addr_map_t *node_memory);
    469   6445       jjc 
    470   6445       jjc static hrtime_t	opt_probe_vendor(int dest_node, int nreads);
    471      0    stevel 
    472      0    stevel 
    473      0    stevel /*
    474   6445       jjc  * PLATFORM INTERFACE ROUTINES
    475      0    stevel  */
    476      0    stevel 
    477      0    stevel /*
    478      0    stevel  * Configure memory nodes for machines with more than one node (ie NUMA)
    479      0    stevel  */
    480      0    stevel void
    481      0    stevel plat_build_mem_nodes(struct memlist *list)
    482      0    stevel {
    483    892       jjc 	pfn_t		cur_start;	/* start addr of subrange */
    484    892       jjc 	pfn_t		cur_end;	/* end addr of subrange */
    485    892       jjc 	pfn_t		start;		/* start addr of whole range */
    486    892       jjc 	pfn_t		end;		/* end addr of whole range */
    487   9732       Kit 	pgcnt_t		endcnt;		/* pages to sacrifice */
    488      0    stevel 
    489      0    stevel 	/*
    490      0    stevel 	 * Boot install lists are arranged <addr, len>, ...
    491      0    stevel 	 */
    492      0    stevel 	while (list) {
    493      0    stevel 		int	node;
    494      0    stevel 
    495      0    stevel 		start = list->address >> PAGESHIFT;
    496      0    stevel 		end = (list->address + list->size - 1) >> PAGESHIFT;
    497      0    stevel 
    498      0    stevel 		if (start > physmax) {
    499      0    stevel 			list = list->next;
    500      0    stevel 			continue;
    501      0    stevel 		}
    502      0    stevel 		if (end > physmax)
    503      0    stevel 			end = physmax;
    504      0    stevel 
    505      0    stevel 		/*
    506      0    stevel 		 * When there is only one memnode, just add memory to memnode
    507      0    stevel 		 */
    508      0    stevel 		if (max_mem_nodes == 1) {
    509      0    stevel 			mem_node_add_slice(start, end);
    510      0    stevel 			list = list->next;
    511      0    stevel 			continue;
    512      0    stevel 		}
    513      0    stevel 
    514      0    stevel 		/*
    515      0    stevel 		 * mem_node_add_slice() expects to get a memory range that
    516      0    stevel 		 * is within one memnode, so need to split any memory range
    517      0    stevel 		 * that spans multiple memnodes into subranges that are each
    518      0    stevel 		 * contained within one memnode when feeding them to
    519      0    stevel 		 * mem_node_add_slice()
    520      0    stevel 		 */
    521      0    stevel 		cur_start = start;
    522      0    stevel 		do {
    523      0    stevel 			node = plat_pfn_to_mem_node(cur_start);
    524      0    stevel 
    525    892       jjc 			/*
    526    892       jjc 			 * Panic if DRAM address map registers or SRAT say
    527    892       jjc 			 * memory in node doesn't exist or address from
    528    892       jjc 			 * boot installed memory list entry isn't in this node.
    529    892       jjc 			 * This shouldn't happen and rest of code can't deal
    530    892       jjc 			 * with this if it does.
    531    892       jjc 			 */
    532    892       jjc 			if (node < 0 || node >= lgrp_plat_node_cnt ||
    533    892       jjc 			    !lgrp_plat_node_memory[node].exists ||
    534    892       jjc 			    cur_start < lgrp_plat_node_memory[node].start ||
    535    892       jjc 			    cur_start > lgrp_plat_node_memory[node].end) {
    536    892       jjc 				cmn_err(CE_PANIC, "Don't know which memnode "
    537    892       jjc 				    "to add installed memory address 0x%lx\n",
    538    892       jjc 				    cur_start);
    539    892       jjc 			}
    540      0    stevel 
    541      0    stevel 			/*
    542      0    stevel 			 * End of current subrange should not span memnodes
    543      0    stevel 			 */
    544    892       jjc 			cur_end = end;
    545   9732       Kit 			endcnt = 0;
    546    892       jjc 			if (lgrp_plat_node_memory[node].exists &&
    547   9732       Kit 			    cur_end > lgrp_plat_node_memory[node].end) {
    548      0    stevel 				cur_end = lgrp_plat_node_memory[node].end;
    549   9732       Kit 				if (mnode_xwa > 1) {
    550   9732       Kit 					/*
    551   9732       Kit 					 * sacrifice the last page in each
    552   9732       Kit 					 * node to eliminate large pages
    553   9732       Kit 					 * that span more than 1 memory node.
    554   9732       Kit 					 */
    555   9732       Kit 					endcnt = 1;
    556  10274       Kit 					physinstalled--;
    557   9732       Kit 				}
    558   9732       Kit 			}
    559      0    stevel 
    560   9732       Kit 			mem_node_add_slice(cur_start, cur_end - endcnt);
    561      0    stevel 
    562      0    stevel 			/*
    563      0    stevel 			 * Next subrange starts after end of current one
    564      0    stevel 			 */
    565      0    stevel 			cur_start = cur_end + 1;
    566      0    stevel 		} while (cur_end < end);
    567      0    stevel 
    568      0    stevel 		list = list->next;
    569      0    stevel 	}
    570      0    stevel 	mem_node_physalign = 0;
    571      0    stevel 	mem_node_pfn_shift = 0;
    572      0    stevel }
    573      0    stevel 
    574      0    stevel 
    575   6445       jjc int
    576   6445       jjc plat_lgrphand_to_mem_node(lgrp_handle_t hand)
    577   6445       jjc {
    578   6445       jjc 	if (max_mem_nodes == 1)
    579   6445       jjc 		return (0);
    580   6445       jjc 
    581   6445       jjc 	return ((int)hand);
    582   6445       jjc }
    583   6445       jjc 
    584   6445       jjc 
    585   6445       jjc /*
    586   6445       jjc  * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
    587   6445       jjc  * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
    588   6445       jjc  * a crossing is found and returns 0 otherwise.
    589   6445       jjc  */
    590   6445       jjc int
    591   6445       jjc plat_mnode_xcheck(pfn_t pfncnt)
    592   6445       jjc {
    593   6445       jjc 	int	node, prevnode = -1, basenode;
    594   6445       jjc 	pfn_t	ea, sa;
    595   6445       jjc 
    596   6445       jjc 	for (node = 0; node < lgrp_plat_node_cnt; node++) {
    597   6445       jjc 
    598   6445       jjc 		if (lgrp_plat_node_memory[node].exists == 0)
    599   6445       jjc 			continue;
    600   6445       jjc 
    601   6445       jjc 		if (prevnode == -1) {
    602   6445       jjc 			prevnode = node;
    603   6445       jjc 			basenode = node;
    604   6445       jjc 			continue;
    605   6445       jjc 		}
    606   6445       jjc 
    607   6445       jjc 		/* assume x86 node pfn ranges are in increasing order */
    608   6445       jjc 		ASSERT(lgrp_plat_node_memory[node].start >
    609   6445       jjc 		    lgrp_plat_node_memory[prevnode].end);
    610   6445       jjc 
    611   6445       jjc 		/*
    612   6445       jjc 		 * continue if the starting address of node is not contiguous
    613   6445       jjc 		 * with the previous node.
    614   6445       jjc 		 */
    615   6445       jjc 
    616   6445       jjc 		if (lgrp_plat_node_memory[node].start !=
    617   6445       jjc 		    (lgrp_plat_node_memory[prevnode].end + 1)) {
    618   6445       jjc 			basenode = node;
    619   6445       jjc 			prevnode = node;
    620   6445       jjc 			continue;
    621   6445       jjc 		}
    622   6445       jjc 
    623   6445       jjc 		/* check if the starting address of node is pfncnt aligned */
    624   6445       jjc 		if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) {
    625   6445       jjc 
    626   6445       jjc 			/*
    627   6445       jjc 			 * at this point, node starts at an unaligned boundary
    628   6445       jjc 			 * and is contiguous with the previous node(s) to
    629   6445       jjc 			 * basenode. Check if there is an aligned contiguous
    630   6445       jjc 			 * range of length pfncnt that crosses this boundary.
    631   6445       jjc 			 */
    632   6445       jjc 
    633   6445       jjc 			sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end,
    634   6445       jjc 			    pfncnt);
    635   6445       jjc 			ea = P2ROUNDUP((lgrp_plat_node_memory[node].start),
    636   6445       jjc 			    pfncnt);
    637   6445       jjc 
    638   6445       jjc 			ASSERT((ea - sa) == pfncnt);
    639   6445       jjc 			if (sa >= lgrp_plat_node_memory[basenode].start &&
    640   9732       Kit 			    ea <= (lgrp_plat_node_memory[node].end + 1)) {
    641   9732       Kit 				/*
    642   9732       Kit 				 * large page found to cross mnode boundary.
    643   9732       Kit 				 * Return Failure if workaround not enabled.
    644   9732       Kit 				 */
    645   9732       Kit 				if (mnode_xwa == 0)
    646   9732       Kit 					return (1);
    647   9732       Kit 				mnode_xwa++;
    648   9732       Kit 			}
    649   6445       jjc 		}
    650   6445       jjc 		prevnode = node;
    651   6445       jjc 	}
    652   6445       jjc 	return (0);
    653   6445       jjc }
    654   6445       jjc 
    655   6445       jjc 
    656   6445       jjc lgrp_handle_t
    657   6445       jjc plat_mem_node_to_lgrphand(int mnode)
    658   6445       jjc {
    659   6445       jjc 	if (max_mem_nodes == 1)
    660   6445       jjc 		return (LGRP_DEFAULT_HANDLE);
    661   6445       jjc 
    662   6445       jjc 	return ((lgrp_handle_t)mnode);
    663   6445       jjc }
    664   6445       jjc 
    665   6445       jjc 
    666   6445       jjc int
    667   6445       jjc plat_pfn_to_mem_node(pfn_t pfn)
    668   6445       jjc {
    669   6445       jjc 	int	node;
    670   6445       jjc 
    671   6445       jjc 	if (max_mem_nodes == 1)
    672   6445       jjc 		return (0);
    673   6445       jjc 
    674   6445       jjc 	for (node = 0; node < lgrp_plat_node_cnt; node++) {
    675   6445       jjc 		/*
    676   6445       jjc 		 * Skip nodes with no memory
    677   6445       jjc 		 */
    678   6445       jjc 		if (!lgrp_plat_node_memory[node].exists)
    679   6445       jjc 			continue;
    680   6445       jjc 
    681   6445       jjc 		if (pfn >= lgrp_plat_node_memory[node].start &&
    682   6445       jjc 		    pfn <= lgrp_plat_node_memory[node].end)
    683   6445       jjc 			return (node);
    684   6445       jjc 	}
    685   6445       jjc 
    686   6445       jjc 	/*
    687   6445       jjc 	 * Didn't find memnode where this PFN lives which should never happen
    688   6445       jjc 	 */
    689   6445       jjc 	ASSERT(node < lgrp_plat_node_cnt);
    690   6445       jjc 	return (-1);
    691   6445       jjc }
    692   6445       jjc 
    693   6445       jjc 
    694   6445       jjc /*
    695   6445       jjc  * LGROUP PLATFORM INTERFACE ROUTINES
    696   6445       jjc  */
    697   6445       jjc 
    698   6445       jjc /*
    699   6445       jjc  * Allocate additional space for an lgroup.
    700   6445       jjc  */
    701   6445       jjc /* ARGSUSED */
    702   6445       jjc lgrp_t *
    703   6445       jjc lgrp_plat_alloc(lgrp_id_t lgrpid)
    704   6445       jjc {
    705   6445       jjc 	lgrp_t *lgrp;
    706   6445       jjc 
    707   6445       jjc 	lgrp = &lgrp_space[nlgrps_alloc++];
    708   6445       jjc 	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
    709   6445       jjc 		return (NULL);
    710   6445       jjc 	return (lgrp);
    711   6445       jjc }
    712   6445       jjc 
    713   6445       jjc 
    714   6445       jjc /*
    715   6445       jjc  * Platform handling for (re)configuration changes
    716   6445       jjc  */
    717   6445       jjc /* ARGSUSED */
    718   6445       jjc void
    719   6445       jjc lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
    720   6445       jjc {
    721   6445       jjc }
    722   6445       jjc 
    723   6445       jjc 
    724   6445       jjc /*
    725   6445       jjc  * Return the platform handle for the lgroup containing the given CPU
    726   6445       jjc  */
    727   6445       jjc /* ARGSUSED */
    728   6445       jjc lgrp_handle_t
    729   6445       jjc lgrp_plat_cpu_to_hand(processorid_t id)
    730   6445       jjc {
    731   6445       jjc 	lgrp_handle_t	hand;
    732   6445       jjc 
    733   6445       jjc 	if (lgrp_plat_node_cnt == 1)
    734   6445       jjc 		return (LGRP_DEFAULT_HANDLE);
    735   6445       jjc 
    736   6445       jjc 	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
    737  10710  jonathan 	    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
    738   6445       jjc 
    739   6445       jjc 	ASSERT(hand != (lgrp_handle_t)-1);
    740   6445       jjc 	if (hand == (lgrp_handle_t)-1)
    741   6445       jjc 		return (LGRP_NULL_HANDLE);
    742   6445       jjc 
    743   6445       jjc 	return (hand);
    744   6445       jjc }
    745   6445       jjc 
    746   6445       jjc 
    747      0    stevel /*
    748      0    stevel  * Platform-specific initialization of lgroups
    749      0    stevel  */
    750      0    stevel void
    751  10710  jonathan lgrp_plat_init(lgrp_init_stages_t stage)
    752      0    stevel {
    753   5084   johnlev #if defined(__xpv)
    754   5084   johnlev #else	/* __xpv */
    755   9053  jonathan 	u_longlong_t	value;
    756  10710  jonathan #endif	/* __xpv */
    757  10710  jonathan 
    758  10710  jonathan 	switch (stage) {
    759  10710  jonathan 	case LGRP_INIT_STAGE1:
    760  10710  jonathan #if defined(__xpv)
    761  10710  jonathan 		/*
    762  10710  jonathan 		 * XXPV	For now, the hypervisor treats all memory equally.
    763  10710  jonathan 		 */
    764  10710  jonathan 		lgrp_plat_node_cnt = max_mem_nodes = 1;
    765  10710  jonathan #else	/* __xpv */
    766  10710  jonathan 		/*
    767  10710  jonathan 		 * Get boot property for lgroup topology height limit
    768  10710  jonathan 		 */
    769  10710  jonathan 		if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
    770  10710  jonathan 			(void) lgrp_topo_ht_limit_set((int)value);
    771  10710  jonathan 
    772  10710  jonathan 		/*
    773  10710  jonathan 		 * Get boot property for enabling/disabling SRAT
    774  10710  jonathan 		 */
    775  10710  jonathan 		if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
    776  10710  jonathan 			lgrp_plat_srat_enable = (int)value;
    777  10710  jonathan 
    778  10710  jonathan 		/*
    779  10710  jonathan 		 * Get boot property for enabling/disabling SLIT
    780  10710  jonathan 		 */
    781  10710  jonathan 		if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
    782  10710  jonathan 			lgrp_plat_slit_enable = (int)value;
    783  10710  jonathan 
    784  10710  jonathan 		/*
    785  10710  jonathan 		 * Initialize as a UMA machine
    786  10710  jonathan 		 */
    787  10710  jonathan 		if (lgrp_topo_ht_limit() == 1) {
    788  10710  jonathan 			lgrp_plat_node_cnt = max_mem_nodes = 1;
    789  10710  jonathan 			return;
    790  10710  jonathan 		}
    791  10710  jonathan 
    792  10710  jonathan 		lgrp_plat_get_numa_config();
    793  10710  jonathan #endif	/* __xpv */
    794  10710  jonathan 		break;
    795  10710  jonathan 
    796  10710  jonathan 	case LGRP_INIT_STAGE3:
    797  10710  jonathan 		lgrp_plat_probe();
    798  10710  jonathan 		lgrp_plat_release_bootstrap();
    799  10710  jonathan 		break;
    800  10710  jonathan 
    801  10710  jonathan 	case LGRP_INIT_STAGE4:
    802  10710  jonathan 		lgrp_plat_main_init();
    803  10710  jonathan 		break;
    804  10710  jonathan 
    805  10710  jonathan 	default:
    806  10710  jonathan 		break;
    807  10710  jonathan 	}
    808  10710  jonathan }
    809  10710  jonathan 
    810  10710  jonathan 
    811  10710  jonathan /*
    812  10710  jonathan  * Return latency between "from" and "to" lgroups
    813  10710  jonathan  *
    814  10710  jonathan  * This latency number can only be used for relative comparison
    815  10710  jonathan  * between lgroups on the running system, cannot be used across platforms,
    816  10710  jonathan  * and may not reflect the actual latency.  It is platform and implementation
    817  10710  jonathan  * specific, so platform gets to decide its value.  It would be nice if the
    818  10710  jonathan  * number was at least proportional to make comparisons more meaningful though.
    819  10710  jonathan  */
    820  10710  jonathan /* ARGSUSED */
    821  10710  jonathan int
    822  10710  jonathan lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
    823  10710  jonathan {
    824  10710  jonathan 	lgrp_handle_t	src, dest;
    825  10710  jonathan 	int		node;
    826  10710  jonathan 
    827  10710  jonathan 	if (max_mem_nodes == 1)
    828  10710  jonathan 		return (0);
    829   9053  jonathan 
    830   9053  jonathan 	/*
    831  10710  jonathan 	 * Return max latency for root lgroup
    832   9053  jonathan 	 */
    833  10710  jonathan 	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
    834  10710  jonathan 		return (lgrp_plat_lat_stats.latency_max);
    835  10710  jonathan 
    836  10710  jonathan 	src = from;
    837  10710  jonathan 	dest = to;
    838   9053  jonathan 
    839   9053  jonathan 	/*
    840  10710  jonathan 	 * Return 0 for nodes (lgroup platform handles) out of range
    841   9053  jonathan 	 */
    842  10710  jonathan 	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
    843  10710  jonathan 		return (0);
    844   9053  jonathan 
    845   9053  jonathan 	/*
    846  10710  jonathan 	 * Probe from current CPU if its lgroup latencies haven't been set yet
    847  10710  jonathan 	 * and we are trying to get latency from current CPU to some node
    848   9053  jonathan 	 */
    849  10710  jonathan 	node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
    850  10710  jonathan 	    lgrp_plat_cpu_node_nentries);
    851  10710  jonathan 	ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
    852  10710  jonathan 	if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src)
    853  10710  jonathan 		lgrp_plat_probe();
    854  10710  jonathan 
    855  10710  jonathan 	return (lgrp_plat_lat_stats.latencies[src][dest]);
    856  10710  jonathan }
    857  10710  jonathan 
    858  10710  jonathan 
    859  10710  jonathan /*
    860  10710  jonathan  * Return the maximum number of lgrps supported by the platform.
    861  10710  jonathan  * Before lgrp topology is known it returns an estimate based on the number of
    862  10710  jonathan  * nodes. Once topology is known it returns the actual maximim number of lgrps
    863  10710  jonathan  * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and
    864  10710  jonathan  * dynamic addition of new nodes, this number may not grow during system
    865  10710  jonathan  * lifetime (yet).
    866  10710  jonathan  */
    867  10710  jonathan int
    868  10710  jonathan lgrp_plat_max_lgrps(void)
    869  10710  jonathan {
    870  10710  jonathan 	return (lgrp_topo_initialized ?
    871  10710  jonathan 	    lgrp_alloc_max + 1 :
    872  10710  jonathan 	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
    873  10710  jonathan }
    874  10710  jonathan 
    875  10710  jonathan 
    876  10710  jonathan /*
    877  10710  jonathan  * Return the number of free pages in an lgroup.
    878  10710  jonathan  *
    879  10710  jonathan  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
    880  10710  jonathan  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
    881  10710  jonathan  * number of allocatable base pagesize pages corresponding to the
    882  10710  jonathan  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
    883  10710  jonathan  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
    884  10710  jonathan  * memory installed, regardless of whether or not it's usable.
    885  10710  jonathan  */
    886  10710  jonathan pgcnt_t
    887  10710  jonathan lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
    888  10710  jonathan {
    889  10710  jonathan 	int	mnode;
    890  10710  jonathan 	pgcnt_t npgs = (pgcnt_t)0;
    891  10710  jonathan 	extern struct memlist *phys_avail;
    892  10710  jonathan 	extern struct memlist *phys_install;
    893  10710  jonathan 
    894  10710  jonathan 
    895  10710  jonathan 	if (plathand == LGRP_DEFAULT_HANDLE)
    896  10710  jonathan 		return (lgrp_plat_mem_size_default(plathand, query));
    897  10710  jonathan 
    898  10710  jonathan 	if (plathand != LGRP_NULL_HANDLE) {
    899  10710  jonathan 		mnode = plat_lgrphand_to_mem_node(plathand);
    900  10710  jonathan 		if (mnode >= 0 && mem_node_config[mnode].exists) {
    901  10710  jonathan 			switch (query) {
    902  10710  jonathan 			case LGRP_MEM_SIZE_FREE:
    903  10710  jonathan 				npgs = MNODE_PGCNT(mnode);
    904  10710  jonathan 				break;
    905  10710  jonathan 			case LGRP_MEM_SIZE_AVAIL:
    906  10710  jonathan 				npgs = mem_node_memlist_pages(mnode,
    907  10710  jonathan 				    phys_avail);
    908  10710  jonathan 				break;
    909  10710  jonathan 			case LGRP_MEM_SIZE_INSTALL:
    910  10710  jonathan 				npgs = mem_node_memlist_pages(mnode,
    911  10710  jonathan 				    phys_install);
    912  10710  jonathan 				break;
    913  10710  jonathan 			default:
    914  10710  jonathan 				break;
    915  10710  jonathan 			}
    916  10710  jonathan 		}
    917  10710  jonathan 	}
    918  10710  jonathan 	return (npgs);
    919  10710  jonathan }
    920  10710  jonathan 
    921  10710  jonathan 
    922  10710  jonathan /*
    923  10710  jonathan  * Return the platform handle of the lgroup that contains the physical memory
    924  10710  jonathan  * corresponding to the given page frame number
    925  10710  jonathan  */
    926  10710  jonathan /* ARGSUSED */
    927  10710  jonathan lgrp_handle_t
    928  10710  jonathan lgrp_plat_pfn_to_hand(pfn_t pfn)
    929  10710  jonathan {
    930  10710  jonathan 	int	mnode;
    931  10710  jonathan 
    932  10710  jonathan 	if (max_mem_nodes == 1)
    933  10710  jonathan 		return (LGRP_DEFAULT_HANDLE);
    934  10710  jonathan 
    935  10710  jonathan 	if (pfn > physmax)
    936  10710  jonathan 		return (LGRP_NULL_HANDLE);
    937  10710  jonathan 
    938  10710  jonathan 	mnode = plat_pfn_to_mem_node(pfn);
    939  10710  jonathan 	if (mnode < 0)
    940  10710  jonathan 		return (LGRP_NULL_HANDLE);
    941  10710  jonathan 
    942  10710  jonathan 	return (MEM_NODE_2_LGRPHAND(mnode));
    943  10710  jonathan }
    944  10710  jonathan 
    945  10710  jonathan 
    946  10710  jonathan /*
    947  10710  jonathan  * Probe memory in each node from current CPU to determine latency topology
    948  10710  jonathan  *
    949  10710  jonathan  * The probing code will probe the vendor ID register on the Northbridge of
    950  10710  jonathan  * Opteron processors and probe memory for other processors by default.
    951  10710  jonathan  *
    952  10710  jonathan  * Since probing is inherently error prone, the code takes laps across all the
    953  10710  jonathan  * nodes probing from each node to each of the other nodes some number of
    954  10710  jonathan  * times.  Furthermore, each node is probed some number of times before moving
    955  10710  jonathan  * onto the next one during each lap.  The minimum latency gotten between nodes
    956  10710  jonathan  * is kept as the latency between the nodes.
    957  10710  jonathan  *
    958  10710  jonathan  * After all that,  the probe times are adjusted by normalizing values that are
    959  10710  jonathan  * close to each other and local latencies are made the same.  Lastly, the
    960  10710  jonathan  * latencies are verified to make sure that certain conditions are met (eg.
    961  10710  jonathan  * local < remote, latency(a, b) == latency(b, a), etc.).
    962  10710  jonathan  *
    963  10710  jonathan  * If any of the conditions aren't met, the code will export a NUMA
    964  10710  jonathan  * configuration with the local CPUs and memory given by the SRAT or PCI config
    965  10710  jonathan  * space registers and one remote memory latency since it can't tell exactly
    966  10710  jonathan  * how far each node is from each other.
    967  10710  jonathan  */
    968  10710  jonathan void
    969  10710  jonathan lgrp_plat_probe(void)
    970  10710  jonathan {
    971  10710  jonathan 	int				from;
    972  10710  jonathan 	int				i;
    973  10710  jonathan 	lgrp_plat_latency_stats_t	*lat_stats;
    974  10710  jonathan 	boolean_t			probed;
    975  10710  jonathan 	hrtime_t			probe_time;
    976  10710  jonathan 	int				to;
    977  10710  jonathan 
    978  10710  jonathan 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
    979  10710  jonathan 	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
    980  10710  jonathan 		return;
    981      0    stevel 
    982      0    stevel 	/*
    983  10710  jonathan 	 * Determine ID of node containing current CPU
    984      0    stevel 	 */
    985  10710  jonathan 	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
    986  10710  jonathan 	    lgrp_plat_cpu_node_nentries);
    987  10710  jonathan 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
    988  10710  jonathan 	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
    989  10710  jonathan 		ASSERT(lgrp_plat_node_domain[from].exists);
    990  10710  jonathan 
    991  10710  jonathan 	/*
    992  10710  jonathan 	 * Don't need to probe if got times already
    993  10710  jonathan 	 */
    994  10710  jonathan 	lat_stats = &lgrp_plat_lat_stats;
    995  10710  jonathan 	if (lat_stats->latencies[from][from] != 0)
    996      0    stevel 		return;
    997  10710  jonathan 
    998  10710  jonathan 	/*
    999  10710  jonathan 	 * Read vendor ID in Northbridge or read and write page(s)
   1000  10710  jonathan 	 * in each node from current CPU and remember how long it takes,
   1001  10710  jonathan 	 * so we can build latency topology of machine later.
   1002  10710  jonathan 	 * This should approximate the memory latency between each node.
   1003  10710  jonathan 	 */
   1004  10710  jonathan 	probed = B_FALSE;
   1005  10710  jonathan 	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
   1006  10710  jonathan 		for (to = 0; to < lgrp_plat_node_cnt; to++) {
   1007  10710  jonathan 			/*
   1008  10710  jonathan 			 * Get probe time and skip over any nodes that can't be
   1009  10710  jonathan 			 * probed yet or don't have memory
   1010  10710  jonathan 			 */
   1011  10710  jonathan 			probe_time = lgrp_plat_probe_time(to,
   1012  10710  jonathan 			    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
   1013  10710  jonathan 			    &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
   1014  10710  jonathan 			    &lgrp_plat_probe_stats);
   1015  10710  jonathan 			if (probe_time == 0)
   1016  10710  jonathan 				continue;
   1017  10710  jonathan 
   1018  10710  jonathan 			probed = B_TRUE;
   1019  10710  jonathan 
   1020  10710  jonathan 			/*
   1021  10710  jonathan 			 * Keep lowest probe time as latency between nodes
   1022  10710  jonathan 			 */
   1023  10710  jonathan 			if (lat_stats->latencies[from][to] == 0 ||
   1024  10710  jonathan 			    probe_time < lat_stats->latencies[from][to])
   1025  10710  jonathan 				lat_stats->latencies[from][to] = probe_time;
   1026  10710  jonathan 
   1027  10710  jonathan 			/*
   1028  10710  jonathan 			 * Update overall minimum and maximum probe times
   1029  10710  jonathan 			 * across all nodes
   1030  10710  jonathan 			 */
   1031  10710  jonathan 			if (probe_time < lat_stats->latency_min ||
   1032  10710  jonathan 			    lat_stats->latency_min == -1)
   1033  10710  jonathan 				lat_stats->latency_min = probe_time;
   1034  10710  jonathan 			if (probe_time > lat_stats->latency_max)
   1035  10710  jonathan 				lat_stats->latency_max = probe_time;
   1036  10710  jonathan 		}
   1037      0    stevel 	}
   1038      0    stevel 
   1039      0    stevel 	/*
   1040  10710  jonathan 	 * Bail out if weren't able to probe any nodes from current CPU
   1041   6671       jjc 	 */
   1042  10710  jonathan 	if (probed == B_FALSE)
   1043  10710  jonathan 		return;
   1044  10710  jonathan 
   1045  10710  jonathan 	/*
   1046  10710  jonathan 	 * - Fix up latencies such that local latencies are same,
   1047  10710  jonathan 	 *   latency(i, j) == latency(j, i), etc. (if possible)
   1048  10710  jonathan 	 *
   1049  10710  jonathan 	 * - Verify that latencies look ok
   1050  10710  jonathan 	 *
   1051  10710  jonathan 	 * - Fallback to just optimizing for local and remote if
   1052  10710  jonathan 	 *   latencies didn't look right
   1053  10710  jonathan 	 */
   1054  10710  jonathan 	lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats,
   1055  10710  jonathan 	    &lgrp_plat_probe_stats);
   1056  10710  jonathan 	lgrp_plat_probe_stats.probe_error_code =
   1057  10710  jonathan 	    lgrp_plat_latency_verify(lgrp_plat_node_memory,
   1058  10710  jonathan 	    &lgrp_plat_lat_stats);
   1059  10710  jonathan 	if (lgrp_plat_probe_stats.probe_error_code)
   1060  10710  jonathan 		lgrp_plat_2level_setup(lgrp_plat_node_memory,
   1061  10710  jonathan 		    &lgrp_plat_lat_stats);
   1062  10710  jonathan }
   1063  10710  jonathan 
   1064  10710  jonathan 
   1065  10710  jonathan /*
   1066  10710  jonathan  * Return platform handle for root lgroup
   1067  10710  jonathan  */
   1068  10710  jonathan lgrp_handle_t
   1069  10710  jonathan lgrp_plat_root_hand(void)
   1070  10710  jonathan {
   1071  10710  jonathan 	return (LGRP_DEFAULT_HANDLE);
   1072  10710  jonathan }
   1073  10710  jonathan 
   1074  10710  jonathan 
   1075  10710  jonathan /*
   1076  10710  jonathan  * INTERNAL ROUTINES
   1077  10710  jonathan  */
   1078  10710  jonathan 
   1079  10710  jonathan 
   1080  10710  jonathan /*
   1081  10710  jonathan  * Update CPU to node mapping for given CPU and proximity domain (and returns
   1082  10710  jonathan  * negative numbers for errors and positive ones for success)
   1083  10710  jonathan  */
   1084  10710  jonathan static int
   1085  10710  jonathan lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
   1086  10710  jonathan     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
   1087  10710  jonathan {
   1088  10710  jonathan 	uint_t	i;
   1089  10710  jonathan 	int	node;
   1090  10710  jonathan 
   1091  10710  jonathan 	/*
   1092  10710  jonathan 	 * Get node number for proximity domain
   1093  10710  jonathan 	 */
   1094  10710  jonathan 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
   1095  10710  jonathan 	if (node == -1) {
   1096  10710  jonathan 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
   1097  10710  jonathan 		    domain);
   1098  10710  jonathan 		if (node == -1)
   1099  10710  jonathan 			return (-1);
   1100  10710  jonathan 	}
   1101  10710  jonathan 
   1102  10710  jonathan 	/*
   1103  10710  jonathan 	 * Search for entry with given APIC ID and fill in its node and
   1104  10710  jonathan 	 * proximity domain IDs (if they haven't been set already)
   1105  10710  jonathan 	 */
   1106  10710  jonathan 	for (i = 0; i < nentries; i++) {
   1107  10710  jonathan 		/*
   1108  10710  jonathan 		 * Skip nonexistent entries and ones without matching APIC ID
   1109  10710  jonathan 		 */
   1110  10710  jonathan 		if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
   1111  10710  jonathan 			continue;
   1112  10710  jonathan 
   1113  10710  jonathan 		/*
   1114  10710  jonathan 		 * Just return if entry completely and correctly filled in
   1115  10710  jonathan 		 * already
   1116  10710  jonathan 		 */
   1117  10710  jonathan 		if (cpu_node[i].prox_domain == domain &&
   1118  10710  jonathan 		    cpu_node[i].node == node)
   1119  10710  jonathan 			return (1);
   1120  10710  jonathan 
   1121  10710  jonathan 		/*
   1122  10710  jonathan 		 * Fill in node and proximity domain IDs
   1123  10710  jonathan 		 */
   1124  10710  jonathan 		cpu_node[i].prox_domain = domain;
   1125  10710  jonathan 		cpu_node[i].node = node;
   1126  10710  jonathan 
   1127  10710  jonathan 		return (0);
   1128  10710  jonathan 	}
   1129  10710  jonathan 
   1130  10710  jonathan 	/*
   1131  10710  jonathan 	 * Return error when entry for APIC ID wasn't found in table
   1132  10710  jonathan 	 */
   1133  10710  jonathan 	return (-2);
   1134  10710  jonathan }
   1135  10710  jonathan 
   1136  10710  jonathan 
   1137  10710  jonathan /*
   1138  10710  jonathan  * Get node ID for given CPU
   1139  10710  jonathan  */
   1140  10710  jonathan static int
   1141  10710  jonathan lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
   1142  10710  jonathan     int cpu_node_nentries)
   1143  10710  jonathan {
   1144  10710  jonathan 	processorid_t	cpuid;
   1145  10710  jonathan 
   1146  10710  jonathan 	if (cp == NULL)
   1147  10710  jonathan 		return (-1);
   1148  10710  jonathan 
   1149  10710  jonathan 	cpuid = cp->cpu_id;
   1150  10710  jonathan 	if (cpuid < 0 || cpuid >= max_ncpus)
   1151  10710  jonathan 		return (-1);
   1152  10710  jonathan 
   1153  10710  jonathan 	/*
   1154  10710  jonathan 	 * SRAT doesn't exist, isn't enabled, or there was an error processing
   1155  10947   Srihari 	 * it, so return node ID for Opteron and -1 otherwise.
   1156  10710  jonathan 	 */
   1157  10710  jonathan 	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
   1158  10710  jonathan 	    lgrp_plat_srat_error) {
   1159  10710  jonathan 		if (is_opteron())
   1160  10947   Srihari 			return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE));
   1161  10710  jonathan 		return (-1);
   1162  10710  jonathan 	}
   1163  10710  jonathan 
   1164  10710  jonathan 	/*
   1165  10710  jonathan 	 * Return -1 when CPU to node ID mapping entry doesn't exist for given
   1166  10710  jonathan 	 * CPU
   1167  10710  jonathan 	 */
   1168  10710  jonathan 	if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
   1169  10710  jonathan 		return (-1);
   1170  10710  jonathan 
   1171  10710  jonathan 	return (cpu_node[cpuid].node);
   1172  10710  jonathan }
   1173  10710  jonathan 
   1174  10710  jonathan 
   1175  10710  jonathan /*
   1176  10710  jonathan  * Return node number for given proximity domain/system locality
   1177  10710  jonathan  */
   1178  10710  jonathan static int
   1179  10710  jonathan lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
   1180  10710  jonathan     uint32_t domain)
   1181  10710  jonathan {
   1182  10710  jonathan 	uint_t	node;
   1183  10710  jonathan 	uint_t	start;
   1184  10710  jonathan 
   1185  10710  jonathan 	/*
   1186  10710  jonathan 	 * Hash proximity domain ID into node to domain mapping table (array),
   1187  10710  jonathan 	 * search for entry with matching proximity domain ID, and return index
   1188  10710  jonathan 	 * of matching entry as node ID.
   1189  10710  jonathan 	 */
   1190  10710  jonathan 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
   1191  10710  jonathan 	do {
   1192  10710  jonathan 		if (node_domain[node].prox_domain == domain &&
   1193  10710  jonathan 		    node_domain[node].exists)
   1194  10710  jonathan 			return (node);
   1195  10710  jonathan 		node = (node + 1) % node_cnt;
   1196  10710  jonathan 	} while (node != start);
   1197  10710  jonathan 	return (-1);
   1198  10710  jonathan }
   1199  10710  jonathan 
   1200  10710  jonathan 
   1201  10710  jonathan /*
   1202  10710  jonathan  * Get NUMA configuration of machine
   1203  10710  jonathan  */
   1204  10710  jonathan static void
   1205  10710  jonathan lgrp_plat_get_numa_config(void)
   1206  10710  jonathan {
   1207  10710  jonathan 	uint_t		probe_op;
   1208  10710  jonathan 
   1209  10710  jonathan 	/*
   1210  10710  jonathan 	 * Read boot property with CPU to APIC ID mapping table/array to
   1211  10710  jonathan 	 * determine number of CPUs
   1212  10710  jonathan 	 */
   1213  10710  jonathan 	lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
   1214   6671       jjc 
   1215   6671       jjc 	/*
   1216   6445       jjc 	 * Determine which CPUs and memory are local to each other and number
   1217   6445       jjc 	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
   1218      0    stevel 	 */
   1219   6706       jjc 	if (lgrp_plat_apic_ncpus > 0) {
   1220   6706       jjc 		int	retval;
   1221  10710  jonathan 
   1222  10710  jonathan 		/*
   1223  10710  jonathan 		 * Temporarily allocate boot memory to use for CPU to node
   1224  10710  jonathan 		 * mapping since kernel memory allocator isn't alive yet
   1225  10710  jonathan 		 */
   1226  10710  jonathan 		lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
   1227  10710  jonathan 		    NULL, lgrp_plat_apic_ncpus * sizeof (cpu_node_map_t),
   1228  10710  jonathan 		    sizeof (int));
   1229  10710  jonathan 
   1230  10710  jonathan 		ASSERT(lgrp_plat_cpu_node != NULL);
   1231  10710  jonathan 		if (lgrp_plat_cpu_node) {
   1232  10710  jonathan 			lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
   1233  10710  jonathan 			bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
   1234  10710  jonathan 			    sizeof (cpu_node_map_t));
   1235  10710  jonathan 		}
   1236  10710  jonathan 
   1237  10710  jonathan 		/*
   1238  10710  jonathan 		 * Fill in CPU to node ID mapping table with APIC ID for each
   1239  10710  jonathan 		 * CPU
   1240  10710  jonathan 		 */
   1241  10710  jonathan 		(void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
   1242   6706       jjc 
   1243   6706       jjc 		retval = lgrp_plat_process_srat(srat_ptr,
   1244   9716  jonathan 		    &lgrp_plat_prox_domain_min,
   1245   6706       jjc 		    lgrp_plat_node_domain, lgrp_plat_cpu_node,
   1246   6706       jjc 		    lgrp_plat_apic_ncpus, lgrp_plat_node_memory);
   1247   6706       jjc 		if (retval <= 0) {
   1248   6706       jjc 			lgrp_plat_srat_error = retval;
   1249   6706       jjc 			lgrp_plat_node_cnt = 1;
   1250   6706       jjc 		} else {
   1251   6706       jjc 			lgrp_plat_srat_error = 0;
   1252   6706       jjc 			lgrp_plat_node_cnt = retval;
   1253   6706       jjc 		}
   1254   6671       jjc 	}
   1255      0    stevel 
   1256      0    stevel 	/*
   1257   6671       jjc 	 * Try to use PCI config space registers on Opteron if there's an error
   1258   6671       jjc 	 * processing CPU to APIC ID mapping or SRAT
   1259      0    stevel 	 */
   1260   6706       jjc 	if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
   1261   6671       jjc 	    is_opteron())
   1262   6445       jjc 		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
   1263   6445       jjc 		    lgrp_plat_node_memory);
   1264      0    stevel 
   1265   4898       jjc 	/*
   1266   6445       jjc 	 * Don't bother to setup system for multiple lgroups and only use one
   1267   6445       jjc 	 * memory node when memory is interleaved between any nodes or there is
   1268   6445       jjc 	 * only one NUMA node
   1269   6445       jjc 	 *
   1270   6445       jjc 	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
   1271   6445       jjc 	 *	 when and if it happens for x86/x64
   1272   4898       jjc 	 */
   1273   6445       jjc 	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
   1274   6445       jjc 		lgrp_plat_node_cnt = max_mem_nodes = 1;
   1275   6445       jjc 		(void) lgrp_topo_ht_limit_set(1);
   1276   6445       jjc 		return;
   1277   4898       jjc 	}
   1278   4898       jjc 
   1279   6445       jjc 	/*
   1280   6445       jjc 	 * Leaf lgroups on x86/x64 architectures contain one physical
   1281   6445       jjc 	 * processor chip. Tune lgrp_expand_proc_thresh and
   1282   6445       jjc 	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
   1283   6445       jjc 	 * things out aggressively.
   1284   6445       jjc 	 */
   1285   6445       jjc 	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
   1286   6445       jjc 	lgrp_expand_proc_diff = 0;
   1287   6445       jjc 
   1288   6445       jjc 	/*
   1289   6445       jjc 	 * There should be one memnode (physical page free list(s)) for
   1290   6445       jjc 	 * each node
   1291   6445       jjc 	 */
   1292   6445       jjc 	max_mem_nodes = lgrp_plat_node_cnt;
   1293   6565       jjc 
   1294   6565       jjc 	/*
   1295   6565       jjc 	 * Initialize min and max latency before reading SLIT or probing
   1296   6565       jjc 	 */
   1297   6565       jjc 	lgrp_plat_lat_stats.latency_min = -1;
   1298   6565       jjc 	lgrp_plat_lat_stats.latency_max = 0;
   1299   6445       jjc 
   1300   6445       jjc 	/*
   1301   6445       jjc 	 * Determine how far each NUMA node is from each other by
   1302   6445       jjc 	 * reading ACPI System Locality Information Table (SLIT) if it
   1303   6445       jjc 	 * exists
   1304   6445       jjc 	 */
   1305   6445       jjc 	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
   1306   6445       jjc 	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
   1307   6445       jjc 	    &lgrp_plat_lat_stats);
   1308   6445       jjc 	if (lgrp_plat_slit_error == 0)
   1309   6445       jjc 		return;
   1310   6445       jjc 
   1311   6445       jjc 	/*
   1312   6445       jjc 	 * Probe to determine latency between NUMA nodes when SLIT
   1313   6445       jjc 	 * doesn't exist or make sense
   1314   6445       jjc 	 */
   1315   6445       jjc 	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
   1316   6445       jjc 
   1317   6445       jjc 	/*
   1318   6445       jjc 	 * Specify whether to probe using vendor ID register or page copy
   1319   6445       jjc 	 * if hasn't been specified already or is overspecified
   1320   6445       jjc 	 */
   1321   6445       jjc 	probe_op = lgrp_plat_probe_flags &
   1322   6445       jjc 	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
   1323   6445       jjc 
   1324   6445       jjc 	if (probe_op == 0 ||
   1325   6445       jjc 	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
   1326   6445       jjc 		lgrp_plat_probe_flags &=
   1327   6445       jjc 		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
   1328   6445       jjc 		if (is_opteron())
   1329   6445       jjc 			lgrp_plat_probe_flags |=
   1330   6445       jjc 			    LGRP_PLAT_PROBE_VENDOR;
   1331   6445       jjc 		else
   1332   6445       jjc 			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
   1333   6445       jjc 	}
   1334   6445       jjc 
   1335   6445       jjc 	/*
   1336   6445       jjc 	 * Probing errors can mess up the lgroup topology and
   1337   6445       jjc 	 * force us fall back to a 2 level lgroup topology.
   1338   6445       jjc 	 * Here we bound how tall the lgroup topology can grow
   1339   6445       jjc 	 * in hopes of avoiding any anamolies in probing from
   1340   6445       jjc 	 * messing up the lgroup topology by limiting the
   1341   6445       jjc 	 * accuracy of the latency topology.
   1342   6445       jjc 	 *
   1343   6445       jjc 	 * Assume that nodes will at least be configured in a
   1344   6445       jjc 	 * ring, so limit height of lgroup topology to be less
   1345   6445       jjc 	 * than number of nodes on a system with 4 or more
   1346   6445       jjc 	 * nodes
   1347   6445       jjc 	 */
   1348   6445       jjc 	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
   1349   6445       jjc 	    lgrp_topo_ht_limit_default())
   1350   6445       jjc 		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
   1351      0    stevel }
   1352      0    stevel 
   1353      0    stevel 
/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same (ie. within 1/16 with the shift of 4 defined here)
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

/*
 * Shift count that the latency comparisons actually use (a latency is shifted
 * right by this amount to get the allowed difference); starts out as
 * LGRP_LAT_TOLERANCE_SHIFT
 */
int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
   1361      0    stevel 
   1362      0    stevel 
   1363      0    stevel /*
   1364      0    stevel  * Adjust latencies between nodes to be symmetric, normalize latencies between
   1365      0    stevel  * any nodes that are within some tolerance to be same, and make local
   1366      0    stevel  * latencies be same
   1367      0    stevel  */
   1368      0    stevel static void
   1369   6445       jjc lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
   1370   6445       jjc     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
   1371      0    stevel {
   1372      0    stevel 	int				i;
   1373      0    stevel 	int				j;
   1374      0    stevel 	int				k;
   1375      0    stevel 	int				l;
   1376      0    stevel 	u_longlong_t			max;
   1377      0    stevel 	u_longlong_t			min;
   1378      0    stevel 	u_longlong_t			t;
   1379      0    stevel 	u_longlong_t			t1;
   1380      0    stevel 	u_longlong_t			t2;
   1381   2988       jjc 	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
   1382      0    stevel 	int				lat_corrected[MAX_NODES][MAX_NODES];
   1383      0    stevel 
   1384      0    stevel 	/*
   1385   6445       jjc 	 * Nothing to do when this is an UMA machine or don't have args needed
   1386      0    stevel 	 */
   1387      0    stevel 	if (max_mem_nodes == 1)
   1388      0    stevel 		return;
   1389   6445       jjc 
   1390   6445       jjc 	ASSERT(node_memory != NULL && lat_stats != NULL &&
   1391   6445       jjc 	    probe_stats != NULL);
   1392      0    stevel 
   1393      0    stevel 	/*
   1394      0    stevel 	 * Make sure that latencies are symmetric between any two nodes
   1395      0    stevel 	 * (ie. latency(node0, node1) == latency(node1, node0))
   1396      0    stevel 	 */
   1397   6445       jjc 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1398   6445       jjc 		if (!node_memory[i].exists)
   1399   6445       jjc 			continue;
   1400   6445       jjc 
   1401      0    stevel 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
   1402   6445       jjc 			if (!node_memory[j].exists)
   1403   6445       jjc 				continue;
   1404   6445       jjc 
   1405   6445       jjc 			t1 = lat_stats->latencies[i][j];
   1406   6445       jjc 			t2 = lat_stats->latencies[j][i];
   1407      0    stevel 
   1408      0    stevel 			if (t1 == 0 || t2 == 0 || t1 == t2)
   1409      0    stevel 				continue;
   1410      0    stevel 
   1411      0    stevel 			/*
   1412      0    stevel 			 * Latencies should be same
   1413      0    stevel 			 * - Use minimum of two latencies which should be same
   1414      0    stevel 			 * - Track suspect probe times not within tolerance of
   1415      0    stevel 			 *   min value
   1416      0    stevel 			 * - Remember how much values are corrected by
   1417      0    stevel 			 */
   1418      0    stevel 			if (t1 > t2) {
   1419      0    stevel 				t = t2;
   1420   6445       jjc 				probe_stats->probe_errors[i][j] += t1 - t2;
   1421      0    stevel 				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
   1422   6445       jjc 					probe_stats->probe_suspect[i][j]++;
   1423   6445       jjc 					probe_stats->probe_suspect[j][i]++;
   1424      0    stevel 				}
   1425      0    stevel 			} else if (t2 > t1) {
   1426      0    stevel 				t = t1;
   1427   6445       jjc 				probe_stats->probe_errors[j][i] += t2 - t1;
   1428      0    stevel 				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
   1429   6445       jjc 					probe_stats->probe_suspect[i][j]++;
   1430   6445       jjc 					probe_stats->probe_suspect[j][i]++;
   1431      0    stevel 				}
   1432      0    stevel 			}
   1433      0    stevel 
   1434   6445       jjc 			lat_stats->latencies[i][j] =
   1435   6445       jjc 			    lat_stats->latencies[j][i] = t;
   1436      0    stevel 			lgrp_config(cflag, t1, t);
   1437      0    stevel 			lgrp_config(cflag, t2, t);
   1438      0    stevel 		}
   1439   6445       jjc 	}
   1440      0    stevel 
   1441      0    stevel 	/*
   1442      0    stevel 	 * Keep track of which latencies get corrected
   1443      0    stevel 	 */
   1444      0    stevel 	for (i = 0; i < MAX_NODES; i++)
   1445      0    stevel 		for (j = 0; j < MAX_NODES; j++)
   1446      0    stevel 			lat_corrected[i][j] = 0;
   1447      0    stevel 
   1448      0    stevel 	/*
   1449      0    stevel 	 * For every two nodes, see whether there is another pair of nodes which
   1450      0    stevel 	 * are about the same distance apart and make the latencies be the same
   1451      0    stevel 	 * if they are close enough together
   1452      0    stevel 	 */
   1453   6445       jjc 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1454      0    stevel 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
   1455   6445       jjc 			if (!node_memory[j].exists)
   1456   6445       jjc 				continue;
   1457      0    stevel 			/*
   1458      0    stevel 			 * Pick one pair of nodes (i, j)
   1459      0    stevel 			 * and get latency between them
   1460      0    stevel 			 */
   1461   6445       jjc 			t1 = lat_stats->latencies[i][j];
   1462      0    stevel 
   1463      0    stevel 			/*
   1464      0    stevel 			 * Skip this pair of nodes if there isn't a latency
   1465      0    stevel 			 * for it yet
   1466      0    stevel 			 */
   1467      0    stevel 			if (t1 == 0)
   1468      0    stevel 				continue;
   1469      0    stevel 
   1470   6445       jjc 			for (k = 0; k < lgrp_plat_node_cnt; k++) {
   1471      0    stevel 				for (l = 0; l < lgrp_plat_node_cnt; l++) {
   1472   6445       jjc 					if (!node_memory[l].exists)
   1473   6445       jjc 						continue;
   1474      0    stevel 					/*
   1475      0    stevel 					 * Pick another pair of nodes (k, l)
   1476      0    stevel 					 * not same as (i, j) and get latency
   1477      0    stevel 					 * between them
   1478      0    stevel 					 */
   1479      0    stevel 					if (k == i && l == j)
   1480      0    stevel 						continue;
   1481      0    stevel 
   1482   6445       jjc 					t2 = lat_stats->latencies[k][l];
   1483      0    stevel 
   1484      0    stevel 					/*
   1485      0    stevel 					 * Skip this pair of nodes if there
   1486      0    stevel 					 * isn't a latency for it yet
   1487      0    stevel 					 */
   1488      0    stevel 
   1489      0    stevel 					if (t2 == 0)
   1490      0    stevel 						continue;
   1491      0    stevel 
   1492      0    stevel 					/*
   1493      0    stevel 					 * Skip nodes (k, l) if they already
   1494      0    stevel 					 * have same latency as (i, j) or
   1495      0    stevel 					 * their latency isn't close enough to
   1496      0    stevel 					 * be considered/made the same
   1497      0    stevel 					 */
   1498      0    stevel 					if (t1 == t2 || (t1 > t2 && t1 - t2 >
   1499      0    stevel 					    t1 >> lgrp_plat_probe_lt_shift) ||
   1500      0    stevel 					    (t2 > t1 && t2 - t1 >
   1501      0    stevel 					    t2 >> lgrp_plat_probe_lt_shift))
   1502      0    stevel 						continue;
   1503      0    stevel 
   1504      0    stevel 					/*
   1505      0    stevel 					 * Make latency(i, j) same as
   1506      0    stevel 					 * latency(k, l), try to use latency
   1507      0    stevel 					 * that has been adjusted already to get
   1508      0    stevel 					 * more consistency (if possible), and
   1509      0    stevel 					 * remember which latencies were
   1510      0    stevel 					 * adjusted for next time
   1511      0    stevel 					 */
   1512      0    stevel 					if (lat_corrected[i][j]) {
   1513      0    stevel 						t = t1;
   1514      0    stevel 						lgrp_config(cflag, t2, t);
   1515      0    stevel 						t2 = t;
   1516      0    stevel 					} else if (lat_corrected[k][l]) {
   1517      0    stevel 						t = t2;
   1518      0    stevel 						lgrp_config(cflag, t1, t);
   1519      0    stevel 						t1 = t;
   1520      0    stevel 					} else {
   1521      0    stevel 						if (t1 > t2)
   1522      0    stevel 							t = t2;
   1523      0    stevel 						else
   1524      0    stevel 							t = t1;
   1525      0    stevel 						lgrp_config(cflag, t1, t);
   1526      0    stevel 						lgrp_config(cflag, t2, t);
   1527      0    stevel 						t1 = t2 = t;
   1528      0    stevel 					}
   1529      0    stevel 
   1530   6445       jjc 					lat_stats->latencies[i][j] =
   1531   6445       jjc 					    lat_stats->latencies[k][l] = t;
   1532      0    stevel 
   1533      0    stevel 					lat_corrected[i][j] =
   1534      0    stevel 					    lat_corrected[k][l] = 1;
   1535      0    stevel 				}
   1536   6445       jjc 			}
   1537      0    stevel 		}
   1538   6445       jjc 	}
   1539      0    stevel 
   1540      0    stevel 	/*
   1541      0    stevel 	 * Local latencies should be same
   1542      0    stevel 	 * - Find min and max local latencies
   1543      0    stevel 	 * - Make all local latencies be minimum
   1544      0    stevel 	 */
   1545      0    stevel 	min = -1;
   1546      0    stevel 	max = 0;
   1547      0    stevel 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1548   6445       jjc 		if (!node_memory[i].exists)
   1549   6445       jjc 			continue;
   1550   6445       jjc 		t = lat_stats->latencies[i][i];
   1551      0    stevel 		if (t == 0)
   1552      0    stevel 			continue;
   1553      0    stevel 		if (min == -1 || t < min)
   1554      0    stevel 			min = t;
   1555      0    stevel 		if (t > max)
   1556      0    stevel 			max = t;
   1557      0    stevel 	}
   1558      0    stevel 	if (min != max) {
   1559      0    stevel 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1560      0    stevel 			int	local;
   1561      0    stevel 
   1562   6445       jjc 			if (!node_memory[i].exists)
   1563   6445       jjc 				continue;
   1564   6445       jjc 
   1565   6445       jjc 			local = lat_stats->latencies[i][i];
   1566      0    stevel 			if (local == 0)
   1567      0    stevel 				continue;
   1568      0    stevel 
   1569      0    stevel 			/*
   1570      0    stevel 			 * Track suspect probe times that aren't within
   1571      0    stevel 			 * tolerance of minimum local latency and how much
   1572      0    stevel 			 * probe times are corrected by
   1573      0    stevel 			 */
   1574      0    stevel 			if (local - min > min >> lgrp_plat_probe_lt_shift)
   1575   6445       jjc 				probe_stats->probe_suspect[i][i]++;
   1576      0    stevel 
   1577   6445       jjc 			probe_stats->probe_errors[i][i] += local - min;
   1578      0    stevel 
   1579      0    stevel 			/*
   1580      0    stevel 			 * Make local latencies be minimum
   1581      0    stevel 			 */
   1582   2988       jjc 			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
   1583   6445       jjc 			lat_stats->latencies[i][i] = min;
   1584      0    stevel 		}
   1585      0    stevel 	}
   1586      0    stevel 
   1587      0    stevel 	/*
   1588      0    stevel 	 * Determine max probe time again since just adjusted latencies
   1589      0    stevel 	 */
   1590   6445       jjc 	lat_stats->latency_max = 0;
   1591   6445       jjc 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1592      0    stevel 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
   1593   6445       jjc 			if (!node_memory[j].exists)
   1594   6445       jjc 				continue;
   1595   6445       jjc 			t = lat_stats->latencies[i][j];
   1596   6445       jjc 			if (t > lat_stats->latency_max)
   1597   6445       jjc 				lat_stats->latency_max = t;
   1598      0    stevel 		}
   1599   6445       jjc 	}
   1600      0    stevel }
   1601      0    stevel 
   1602      0    stevel 
   1603      0    stevel /*
   1604      0    stevel  * Verify following about latencies between nodes:
   1605      0    stevel  *
   1606      0    stevel  * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
   1607      0    stevel  * - Local latencies same
   1608      0    stevel  * - Local < remote
   1609      0    stevel  * - Number of latencies seen is reasonable
   1610      0    stevel  * - Number of occurrences of a given latency should be more than 1
   1611      0    stevel  *
   1612      0    stevel  * Returns:
   1613      0    stevel  *	0	Success
   1614      0    stevel  *	-1	Not symmetric
   1615      0    stevel  *	-2	Local latencies not same
   1616      0    stevel  *	-3	Local >= remote
   1617      0    stevel  */
   1618      0    stevel static int
   1619   6445       jjc lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
   1620   6445       jjc     lgrp_plat_latency_stats_t *lat_stats)
   1621      0    stevel {
   1622      0    stevel 	int				i;
   1623      0    stevel 	int				j;
   1624      0    stevel 	u_longlong_t			t1;
   1625      0    stevel 	u_longlong_t			t2;
   1626   6445       jjc 
   1627   6445       jjc 	ASSERT(node_memory != NULL && lat_stats != NULL);
   1628      0    stevel 
   1629      0    stevel 	/*
   1630     50       jjc 	 * Nothing to do when this is an UMA machine, lgroup topology is
   1631     50       jjc 	 * limited to 2 levels, or there aren't any probe times yet
   1632      0    stevel 	 */
   1633      0    stevel 	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
   1634   6445       jjc 	    lat_stats->latencies[0][0] == 0)
   1635      0    stevel 		return (0);
   1636      0    stevel 
   1637      0    stevel 	/*
   1638      0    stevel 	 * Make sure that latencies are symmetric between any two nodes
   1639      0    stevel 	 * (ie. latency(node0, node1) == latency(node1, node0))
   1640      0    stevel 	 */
   1641   6445       jjc 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1642   6445       jjc 		if (!node_memory[i].exists)
   1643   6445       jjc 			continue;
   1644      0    stevel 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
   1645   6445       jjc 			if (!node_memory[j].exists)
   1646   6445       jjc 				continue;
   1647   6445       jjc 			t1 = lat_stats->latencies[i][j];
   1648   6445       jjc 			t2 = lat_stats->latencies[j][i];
   1649      0    stevel 
   1650      0    stevel 			if (t1 == 0 || t2 == 0 || t1 == t2)
   1651      0    stevel 				continue;
   1652      0    stevel 
   1653      0    stevel 			return (-1);
   1654      0    stevel 		}
   1655   6445       jjc 	}
   1656      0    stevel 
   1657      0    stevel 	/*
   1658      0    stevel 	 * Local latencies should be same
   1659      0    stevel 	 */
   1660   6445       jjc 	t1 = lat_stats->latencies[0][0];
   1661      0    stevel 	for (i = 1; i < lgrp_plat_node_cnt; i++) {
   1662   6445       jjc 		if (!node_memory[i].exists)
   1663   6445       jjc 			continue;
   1664   6445       jjc 
   1665   6445       jjc 		t2 = lat_stats->latencies[i][i];
   1666      0    stevel 		if (t2 == 0)
   1667      0    stevel 			continue;
   1668      0    stevel 
   1669     50       jjc 		if (t1 == 0) {
   1670     50       jjc 			t1 = t2;
   1671     50       jjc 			continue;
   1672     50       jjc 		}
   1673     50       jjc 
   1674      0    stevel 		if (t1 != t2)
   1675      0    stevel 			return (-2);
   1676      0    stevel 	}
   1677      0    stevel 
   1678      0    stevel 	/*
   1679      0    stevel 	 * Local latencies should be less than remote
   1680      0    stevel 	 */
   1681     50       jjc 	if (t1) {
   1682   6445       jjc 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1683     50       jjc 			for (j = 0; j < lgrp_plat_node_cnt; j++) {
   1684   6445       jjc 				if (!node_memory[j].exists)
   1685   6445       jjc 					continue;
   1686   6445       jjc 				t2 = lat_stats->latencies[i][j];
   1687     50       jjc 				if (i == j || t2 == 0)
   1688     50       jjc 					continue;
   1689      0    stevel 
   1690     50       jjc 				if (t1 >= t2)
   1691     50       jjc 					return (-3);
   1692     50       jjc 			}
   1693      0    stevel 		}
   1694      0    stevel 	}
   1695      0    stevel 
   1696      0    stevel 	return (0);
   1697      0    stevel }
   1698      0    stevel 
   1699      0    stevel 
   1700      0    stevel /*
   1701  10710  jonathan  * Platform-specific initialization
   1702  10710  jonathan  */
   1703  10710  jonathan static void
   1704  10710  jonathan lgrp_plat_main_init(void)
   1705  10710  jonathan {
   1706  10710  jonathan 	int	curnode;
   1707  10710  jonathan 	int	ht_limit;
   1708  10710  jonathan 	int	i;
   1709  10710  jonathan 
   1710  10710  jonathan 	/*
   1711  10710  jonathan 	 * Print a notice that MPO is disabled when memory is interleaved
   1712  10710  jonathan 	 * across nodes....Would do this when it is discovered, but can't
   1713  10710  jonathan 	 * because it happens way too early during boot....
   1714  10710  jonathan 	 */
   1715  10710  jonathan 	if (lgrp_plat_mem_intrlv)
   1716  10710  jonathan 		cmn_err(CE_NOTE,
   1717  10710  jonathan 		    "MPO disabled because memory is interleaved\n");
   1718  10710  jonathan 
   1719  10710  jonathan 	/*
   1720  10710  jonathan 	 * Don't bother to do any probing if it is disabled, there is only one
   1721  10710  jonathan 	 * node, or the height of the lgroup topology less than or equal to 2
   1722  10710  jonathan 	 */
   1723  10710  jonathan 	ht_limit = lgrp_topo_ht_limit();
   1724  10710  jonathan 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
   1725  10710  jonathan 	    max_mem_nodes == 1 || ht_limit <= 2) {
   1726  10710  jonathan 		/*
   1727  10710  jonathan 		 * Setup lgroup latencies for 2 level lgroup topology
   1728  10710  jonathan 		 * (ie. local and remote only) if they haven't been set yet
   1729  10710  jonathan 		 */
   1730  10710  jonathan 		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
   1731  10710  jonathan 		    lgrp_plat_lat_stats.latency_max == 0)
   1732  10710  jonathan 			lgrp_plat_2level_setup(lgrp_plat_node_memory,
   1733  10710  jonathan 			    &lgrp_plat_lat_stats);
   1734  10710  jonathan 		return;
   1735  10710  jonathan 	}
   1736  10710  jonathan 
   1737  10710  jonathan 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
   1738  10710  jonathan 		/*
   1739  10710  jonathan 		 * Should have been able to probe from CPU 0 when it was added
   1740  10710  jonathan 		 * to lgroup hierarchy, but may not have been able to then
   1741  10710  jonathan 		 * because it happens so early in boot that gethrtime() hasn't
   1742  10710  jonathan 		 * been initialized.  (:-(
   1743  10710  jonathan 		 */
   1744  10710  jonathan 		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
   1745  10710  jonathan 		    lgrp_plat_cpu_node_nentries);
   1746  10710  jonathan 		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
   1747  10710  jonathan 		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
   1748  10710  jonathan 			lgrp_plat_probe();
   1749  10710  jonathan 
   1750  10710  jonathan 		return;
   1751  10710  jonathan 	}
   1752  10710  jonathan 
   1753  10710  jonathan 	/*
   1754  10710  jonathan 	 * When probing memory, use one page for every sample to determine
   1755  10710  jonathan 	 * lgroup topology and taking multiple samples
   1756  10710  jonathan 	 */
   1757  10710  jonathan 	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
   1758  10710  jonathan 		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
   1759  10710  jonathan 		    lgrp_plat_probe_nsamples;
   1760  10710  jonathan 
   1761  10710  jonathan 	/*
   1762  10710  jonathan 	 * Map memory in each node needed for probing to determine latency
   1763  10710  jonathan 	 * topology
   1764  10710  jonathan 	 */
   1765  10710  jonathan 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   1766  10710  jonathan 		int	mnode;
   1767  10710  jonathan 
   1768  10710  jonathan 		/*
   1769  10710  jonathan 		 * Skip this node and leave its probe page NULL
   1770  10710  jonathan 		 * if it doesn't have any memory
   1771  10710  jonathan 		 */
   1772  10710  jonathan 		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
   1773  10710  jonathan 		if (!mem_node_config[mnode].exists) {
   1774  10710  jonathan 			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
   1775  10710  jonathan 			continue;
   1776  10710  jonathan 		}
   1777  10710  jonathan 
   1778  10710  jonathan 		/*
   1779  10710  jonathan 		 * Allocate one kernel virtual page
   1780  10710  jonathan 		 */
   1781  10710  jonathan 		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
   1782  10710  jonathan 		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
   1783  10710  jonathan 		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
   1784  10710  jonathan 			cmn_err(CE_WARN,
   1785  10710  jonathan 			    "lgrp_plat_main_init: couldn't allocate memory");
   1786  10710  jonathan 			return;
   1787  10710  jonathan 		}
   1788  10710  jonathan 
   1789  10710  jonathan 		/*
   1790  10710  jonathan 		 * Get PFN for first page in each node
   1791  10710  jonathan 		 */
   1792  10710  jonathan 		lgrp_plat_probe_mem_config.probe_pfn[i] =
   1793  10710  jonathan 		    mem_node_config[mnode].physbase;
   1794  10710  jonathan 
   1795  10710  jonathan 		/*
   1796  10710  jonathan 		 * Map virtual page to first page in node
   1797  10710  jonathan 		 */
   1798  10710  jonathan 		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
   1799  10710  jonathan 		    lgrp_plat_probe_mem_config.probe_memsize,
   1800  10710  jonathan 		    lgrp_plat_probe_mem_config.probe_pfn[i],
   1801  10710  jonathan 		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
   1802  10710  jonathan 		    HAT_LOAD_NOCONSIST);
   1803  10710  jonathan 	}
   1804  10710  jonathan 
   1805  10710  jonathan 	/*
   1806  10710  jonathan 	 * Probe from current CPU
   1807  10710  jonathan 	 */
   1808  10710  jonathan 	lgrp_plat_probe();
   1809  10710  jonathan }
   1810  10710  jonathan 
   1811  10710  jonathan 
   1812  10710  jonathan /*
   1813      0    stevel  * Return the number of free, allocatable, or installed
   1814      0    stevel  * pages in an lgroup
   1815      0    stevel  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
   1816      0    stevel  * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
   1817      0    stevel  */
   1818      0    stevel /* ARGSUSED */
   1819      0    stevel static pgcnt_t
   1820      0    stevel lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
   1821      0    stevel {
   1822      0    stevel 	struct memlist *mlist;
   1823      0    stevel 	pgcnt_t npgs = 0;
   1824      0    stevel 	extern struct memlist *phys_avail;
   1825      0    stevel 	extern struct memlist *phys_install;
   1826      0    stevel 
   1827      0    stevel 	switch (query) {
   1828      0    stevel 	case LGRP_MEM_SIZE_FREE:
   1829      0    stevel 		return ((pgcnt_t)freemem);
   1830      0    stevel 	case LGRP_MEM_SIZE_AVAIL:
   1831      0    stevel 		memlist_read_lock();
   1832      0    stevel 		for (mlist = phys_avail; mlist; mlist = mlist->next)
   1833      0    stevel 			npgs += btop(mlist->size);
   1834      0    stevel 		memlist_read_unlock();
   1835      0    stevel 		return (npgs);
   1836      0    stevel 	case LGRP_MEM_SIZE_INSTALL:
   1837      0    stevel 		memlist_read_lock();
   1838      0    stevel 		for (mlist = phys_install; mlist; mlist = mlist->next)
   1839      0    stevel 			npgs += btop(mlist->size);
   1840      0    stevel 		memlist_read_unlock();
   1841      0    stevel 		return (npgs);
   1842      0    stevel 	default:
   1843      0    stevel 		return ((pgcnt_t)0);
   1844      0    stevel 	}
   1845      0    stevel }
   1846      0    stevel 
   1847   6445       jjc 
   1848      0    stevel /*
   1849   6445       jjc  * Update node to proximity domain mappings for given domain and return node ID
   1850      0    stevel  */
   1851   6445       jjc static int
   1852   6706       jjc lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
   1853   6706       jjc     uint32_t domain)
   1854      0    stevel {
   1855   6445       jjc 	uint_t	node;
   1856   6445       jjc 	uint_t	start;
   1857      0    stevel 
   1858   6445       jjc 	/*
   1859   6445       jjc 	 * Hash proximity domain ID into node to domain mapping table (array)
   1860   6445       jjc 	 * and add entry for it into first non-existent or matching entry found
   1861   6445       jjc 	 */
   1862   6706       jjc 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
   1863   6445       jjc 	do {
   1864   6445       jjc 		/*
   1865   6445       jjc 		 * Entry doesn't exist yet, so create one for this proximity
   1866   6445       jjc 		 * domain and return node ID which is index into mapping table.
   1867   6445       jjc 		 */
   1868   6445       jjc 		if (!node_domain[node].exists) {
   1869   6445       jjc 			node_domain[node].exists = 1;
   1870   6445       jjc 			node_domain[node].prox_domain = domain;
   1871   6445       jjc 			return (node);
   1872   6445       jjc 		}
   1873      0    stevel 
   1874   6445       jjc 		/*
   1875   6445       jjc 		 * Entry exists for this proximity domain already, so just
   1876   6445       jjc 		 * return node ID (index into table).
   1877   6445       jjc 		 */
   1878   6445       jjc 		if (node_domain[node].prox_domain == domain)
   1879   6445       jjc 			return (node);
   1880   6706       jjc 		node = NODE_DOMAIN_HASH(node + 1, node_cnt);
   1881   6445       jjc 	} while (node != start);
   1882      0    stevel 
   1883   6445       jjc 	/*
   1884   6445       jjc 	 * Ran out of supported number of entries which shouldn't happen....
   1885   6445       jjc 	 */
   1886   6445       jjc 	ASSERT(node != start);
   1887   6445       jjc 	return (-1);
   1888   6445       jjc }
   1889   6445       jjc 
   1890   6445       jjc 
   1891   6445       jjc /*
   1892   6445       jjc  * Update node memory information for given proximity domain with specified
   1893   6445       jjc  * starting and ending physical address range (and return positive numbers for
   1894   6445       jjc  * success and negative ones for errors)
   1895   6445       jjc  */
   1896   6445       jjc static int
   1897   6706       jjc lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt,
   1898   6462       jjc     node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
   1899   6445       jjc     uint32_t domain)
   1900   6445       jjc {
   1901   6445       jjc 	int	node;
   1902   6445       jjc 
   1903   6445       jjc 	/*
   1904   6445       jjc 	 * Get node number for proximity domain
   1905   6445       jjc 	 */
   1906   6706       jjc 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
   1907   6445       jjc 	if (node == -1) {
   1908   6706       jjc 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
   1909   6706       jjc 		    domain);
   1910   6445       jjc 		if (node == -1)
   1911   6445       jjc 			return (-1);
   1912   6445       jjc 	}
   1913   6445       jjc 
   1914   6445       jjc 	/*
   1915   6445       jjc 	 * Create entry in table for node if it doesn't exist
   1916   6445       jjc 	 */
   1917   6445       jjc 	if (!node_memory[node].exists) {
   1918   6445       jjc 		node_memory[node].exists = 1;
   1919   6445       jjc 		node_memory[node].start = btop(start);
   1920   6445       jjc 		node_memory[node].end = btop(end);
   1921   6445       jjc 		node_memory[node].prox_domain = domain;
   1922   6445       jjc 		return (0);
   1923   6445       jjc 	}
   1924   6445       jjc 
   1925   6445       jjc 	/*
   1926   6445       jjc 	 * Entry already exists for this proximity domain
   1927   6445       jjc 	 *
   1928   6445       jjc 	 * There may be more than one SRAT memory entry for a domain, so we may
   1929   6445       jjc 	 * need to update existing start or end address for the node.
   1930   6445       jjc 	 */
   1931   6445       jjc 	if (node_memory[node].prox_domain == domain) {
   1932   6445       jjc 		if (btop(start) < node_memory[node].start)
   1933   6445       jjc 			node_memory[node].start = btop(start);
   1934   6445       jjc 		if (btop(end) > node_memory[node].end)
   1935   6445       jjc 			node_memory[node].end = btop(end);
   1936   6445       jjc 		return (1);
   1937   6445       jjc 	}
   1938   6445       jjc 	return (-2);
   1939   6445       jjc }
   1940   6445       jjc 
   1941   6445       jjc 
   1942   6445       jjc /*
   1943   9716  jonathan  * Have to sort node by starting physical address because VM system (physical
   1944   9716  jonathan  * page free list management) assumes and expects memnodes to be sorted in
   1945   9716  jonathan  * ascending order by physical address.  If not, the kernel will panic in
   1946   9716  jonathan  * potentially a number of different places.  (:-(
   1947   9716  jonathan  * NOTE: This workaround will not be sufficient if/when hotplugging memory is
   1948   9716  jonathan  *	 supported on x86/x64.
   1949   9716  jonathan  */
   1950   9716  jonathan static void
   1951   9716  jonathan lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt,
   1952   9716  jonathan     cpu_node_map_t *cpu_node, int cpu_count, node_phys_addr_map_t *node_memory)
   1953   9716  jonathan {
   1954   9716  jonathan 	boolean_t	found;
   1955   9716  jonathan 	int		i;
   1956   9716  jonathan 	int		j;
   1957   9716  jonathan 	int		n;
   1958   9716  jonathan 	boolean_t	sorted;
   1959   9716  jonathan 	boolean_t	swapped;
   1960   9716  jonathan 
   1961   9716  jonathan 	if (!lgrp_plat_node_sort_enable || node_cnt <= 1 ||
   1962   9716  jonathan 	    node_domain == NULL || node_memory == NULL)
   1963   9716  jonathan 		return;
   1964   9716  jonathan 
   1965   9716  jonathan 	/*
   1966   9716  jonathan 	 * Sorted already?
   1967   9716  jonathan 	 */
   1968   9716  jonathan 	sorted = B_TRUE;
   1969   9716  jonathan 	for (i = 0; i < node_cnt - 1; i++) {
   1970   9716  jonathan 		/*
   1971   9716  jonathan 		 * Skip entries that don't exist
   1972   9716  jonathan 		 */
   1973   9716  jonathan 		if (!node_memory[i].exists)
   1974   9716  jonathan 			continue;
   1975   9716  jonathan 
   1976   9716  jonathan 		/*
   1977   9716  jonathan 		 * Try to find next existing entry to compare against
   1978   9716  jonathan 		 */
   1979   9716  jonathan 		found = B_FALSE;
   1980   9716  jonathan 		for (j = i + 1; j < node_cnt; j++) {
   1981   9716  jonathan 			if (node_memory[j].exists) {
   1982   9716  jonathan 				found = B_TRUE;
   1983   9716  jonathan 				break;
   1984   9716  jonathan 			}
   1985   9716  jonathan 		}
   1986   9716  jonathan 
   1987   9716  jonathan 		/*
   1988   9716  jonathan 		 * Done if no more existing entries to compare against
   1989   9716  jonathan 		 */
   1990   9716  jonathan 		if (found == B_FALSE)
   1991   9716  jonathan 			break;
   1992   9716  jonathan 
   1993   9716  jonathan 		/*
   1994   9716  jonathan 		 * Not sorted if starting address of current entry is bigger
   1995   9716  jonathan 		 * than starting address of next existing entry
   1996   9716  jonathan 		 */
   1997   9716  jonathan 		if (node_memory[i].start > node_memory[j].start) {
   1998   9716  jonathan 			sorted = B_FALSE;
   1999   9716  jonathan 			break;
   2000   9716  jonathan 		}
   2001   9716  jonathan 	}
   2002   9716  jonathan 
   2003   9716  jonathan 	/*
   2004   9716  jonathan 	 * Don't need to sort if sorted already
   2005   9716  jonathan 	 */
   2006   9716  jonathan 	if (sorted == B_TRUE)
   2007   9716  jonathan 		return;
   2008   9716  jonathan 
   2009   9716  jonathan 	/*
   2010   9716  jonathan 	 * Just use bubble sort since number of nodes is small
   2011   9716  jonathan 	 */
   2012   9716  jonathan 	n = node_cnt;
   2013   9716  jonathan 	do {
   2014   9716  jonathan 		swapped = B_FALSE;
   2015   9716  jonathan 		n--;
   2016   9716  jonathan 		for (i = 0; i < n; i++) {
   2017   9716  jonathan 			/*
   2018   9716  jonathan 			 * Skip entries that don't exist
   2019   9716  jonathan 			 */
   2020   9716  jonathan 			if (!node_memory[i].exists)
   2021   9716  jonathan 				continue;
   2022   9716  jonathan 
   2023   9716  jonathan 			/*
   2024   9716  jonathan 			 * Try to find next existing entry to compare against
   2025   9716  jonathan 			 */
   2026   9716  jonathan 			found = B_FALSE;
   2027   9716  jonathan 			for (j = i + 1; j <= n; j++) {
   2028   9716  jonathan 				if (node_memory[j].exists) {
   2029   9716  jonathan 					found = B_TRUE;
   2030   9716  jonathan 					break;
   2031   9716  jonathan 				}
   2032   9716  jonathan 			}
   2033   9716  jonathan 
   2034   9716  jonathan 			/*
   2035   9716  jonathan 			 * Done if no more existing entries to compare against
   2036   9716  jonathan 			 */
   2037   9716  jonathan 			if (found == B_FALSE)
   2038   9716  jonathan 				break;
   2039   9716  jonathan 
   2040   9716  jonathan 			if (node_memory[i].start > node_memory[j].start) {
   2041   9716  jonathan 				node_phys_addr_map_t	save_addr;
   2042   9716  jonathan 				node_domain_map_t	save_node;
   2043   9716  jonathan 
   2044   9716  jonathan 				/*
   2045   9716  jonathan 				 * Swap node to proxmity domain ID assignments
   2046   9716  jonathan 				 */
   2047   9716  jonathan 				bcopy(&node_domain[i], &save_node,
   2048   9716  jonathan 				    sizeof (node_domain_map_t));
   2049   9716  jonathan 				bcopy(&node_domain[j], &node_domain[i],
   2050   9716  jonathan 				    sizeof (node_domain_map_t));
   2051   9716  jonathan 				bcopy(&save_node, &node_domain[j],
   2052   9716  jonathan 				    sizeof (node_domain_map_t));
   2053   9716  jonathan 
   2054   9716  jonathan 				/*
   2055   9716  jonathan 				 * Swap node to physical memory assignments
   2056   9716  jonathan 				 */
   2057   9716  jonathan 				bcopy(&node_memory[i], &save_addr,
   2058   9716  jonathan 				    sizeof (node_phys_addr_map_t));
   2059   9716  jonathan 				bcopy(&node_memory[j], &node_memory[i],
   2060   9716  jonathan 				    sizeof (node_phys_addr_map_t));
   2061   9716  jonathan 				bcopy(&save_addr, &node_memory[j],
   2062   9716  jonathan 				    sizeof (node_phys_addr_map_t));
   2063   9716  jonathan 				swapped = B_TRUE;
   2064   9716  jonathan 			}
   2065   9716  jonathan 		}
   2066   9716  jonathan 	} while (swapped == B_TRUE);
   2067   9716  jonathan 
   2068   9716  jonathan 	/*
   2069   9716  jonathan 	 * Check to make sure that CPUs assigned to correct node IDs now since
   2070   9716  jonathan 	 * node to proximity domain ID assignments may have been changed above
   2071   9716  jonathan 	 */
   2072   9716  jonathan 	if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1)
   2073   9716  jonathan 		return;
   2074   9716  jonathan 	for (i = 0; i < cpu_count; i++) {
   2075   9716  jonathan 		int		node;
   2076   9716  jonathan 
   2077   9716  jonathan 		node = lgrp_plat_domain_to_node(node_domain, node_cnt,
   2078   9716  jonathan 		    cpu_node[i].prox_domain);
   2079   9716  jonathan 		if (cpu_node[i].node != node)
   2080   9716  jonathan 			cpu_node[i].node = node;
   2081   9716  jonathan 	}
   2082   9716  jonathan 
   2083   9716  jonathan }
   2084   9716  jonathan 
   2085   9716  jonathan 
   2086   9716  jonathan /*
   2087   6445       jjc  * Return time needed to probe from current CPU to memory in given node
   2088   6445       jjc  */
   2089   6445       jjc static hrtime_t
   2090  10710  jonathan lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
   2091   6445       jjc     lgrp_plat_probe_mem_config_t *probe_mem_config,
   2092   6445       jjc     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
   2093   6445       jjc {
   2094   6445       jjc 	caddr_t			buf;
   2095   6445       jjc 	hrtime_t		elapsed;
   2096   6445       jjc 	hrtime_t		end;
   2097   6445       jjc 	int			from;
   2098   6445       jjc 	int			i;
   2099   6445       jjc 	int			ipl;
   2100   6445       jjc 	hrtime_t		max;
   2101   6445       jjc 	hrtime_t		min;
   2102   6445       jjc 	hrtime_t		start;
   2103   6445       jjc 	extern int		use_sse_pagecopy;
   2104   6445       jjc 
   2105   6445       jjc 	/*
   2106   6445       jjc 	 * Determine ID of node containing current CPU
   2107   6445       jjc 	 */
   2108  10710  jonathan 	from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
   2109   6445       jjc 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
   2110   6445       jjc 
   2111   6445       jjc 	/*
   2112   6445       jjc 	 * Do common work for probing main memory
   2113   6445       jjc 	 */
   2114   6445       jjc 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
   2115   6445       jjc 		/*
   2116   6445       jjc 		 * Skip probing any nodes without memory and
   2117   6445       jjc 		 * set probe time to 0
   2118   6445       jjc 		 */
   2119   6445       jjc 		if (probe_mem_config->probe_va[to] == NULL) {
   2120   6445       jjc 			lat_stats->latencies[from][to] = 0;
   2121   6445       jjc 			return (0);
   2122   6445       jjc 		}
   2123   6445       jjc 
   2124   6445       jjc 		/*
   2125   6445       jjc 		 * Invalidate caches once instead of once every sample
   2126   6445       jjc 		 * which should cut cost of probing by a lot
   2127   6445       jjc 		 */
   2128   6445       jjc 		probe_stats->flush_cost = gethrtime();
   2129   6445       jjc 		invalidate_cache();
   2130   6445       jjc 		probe_stats->flush_cost = gethrtime() -
   2131   6445       jjc 		    probe_stats->flush_cost;
   2132   6445       jjc 		probe_stats->probe_cost_total += probe_stats->flush_cost;
   2133   6445       jjc 	}
   2134   6445       jjc 
   2135   6445       jjc 	/*
   2136   6445       jjc 	 * Probe from current CPU to given memory using specified operation
   2137   6445       jjc 	 * and take specified number of samples
   2138   6445       jjc 	 */
   2139   6445       jjc 	max = 0;
   2140   6445       jjc 	min = -1;
   2141   6445       jjc 	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
   2142   6445       jjc 		probe_stats->probe_cost = gethrtime();
   2143   6445       jjc 
   2144   6445       jjc 		/*
   2145   6445       jjc 		 * Can't measure probe time if gethrtime() isn't working yet
   2146   6445       jjc 		 */
   2147   6445       jjc 		if (probe_stats->probe_cost == 0 && gethrtime() == 0)
   2148   6445       jjc 			return (0);
   2149   6445       jjc 
   2150   6445       jjc 		if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
   2151   6445       jjc 			/*
   2152   6445       jjc 			 * Measure how long it takes to read vendor ID from
   2153   6445       jjc 			 * Northbridge
   2154   6445       jjc 			 */
   2155   6445       jjc 			elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
   2156   6445       jjc 		} else {
   2157   6445       jjc 			/*
   2158   6445       jjc 			 * Measure how long it takes to copy page
   2159   6445       jjc 			 * on top of itself
   2160   6445       jjc 			 */
   2161   6445       jjc 			buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
   2162   6445       jjc 
   2163   6445       jjc 			kpreempt_disable();
   2164   6445       jjc 			ipl = splhigh();
   2165   6445       jjc 			start = gethrtime();
   2166   6445       jjc 			if (use_sse_pagecopy)
   2167   6445       jjc 				hwblkpagecopy(buf, buf);
   2168   6445       jjc 			else
   2169   6445       jjc 				bcopy(buf, buf, PAGESIZE);
   2170   6445       jjc 			end = gethrtime();
   2171   6445       jjc 			elapsed = end - start;
   2172   6445       jjc 			splx(ipl);
   2173   6445       jjc 			kpreempt_enable();
   2174   6445       jjc 		}
   2175   6445       jjc 
   2176   6445       jjc 		probe_stats->probe_cost = gethrtime() -
   2177   6445       jjc 		    probe_stats->probe_cost;
   2178   6445       jjc 		probe_stats->probe_cost_total += probe_stats->probe_cost;
   2179   6445       jjc 
   2180   6445       jjc 		if (min == -1 || elapsed < min)
   2181   6445       jjc 			min = elapsed;
   2182   6445       jjc 		if (elapsed > max)
   2183   6445       jjc 			max = elapsed;
   2184   6445       jjc 	}
   2185   6445       jjc 
   2186   6445       jjc 	/*
   2187   6445       jjc 	 * Update minimum and maximum probe times between
   2188   6445       jjc 	 * these two nodes
   2189   6445       jjc 	 */
   2190   6445       jjc 	if (min < probe_stats->probe_min[from][to] ||
   2191   6445       jjc 	    probe_stats->probe_min[from][to] == 0)
   2192   6445       jjc 		probe_stats->probe_min[from][to] = min;
   2193   6445       jjc 
   2194   6445       jjc 	if (max > probe_stats->probe_max[from][to])
   2195   6445       jjc 		probe_stats->probe_max[from][to] = max;
   2196   6445       jjc 
   2197   6445       jjc 	return (min);
   2198   6445       jjc }
   2199   6445       jjc 
   2200   6445       jjc 
   2201   6445       jjc /*
   2202   6706       jjc  * Read boot property with CPU to APIC ID array, fill in CPU to node ID
   2203  10710  jonathan  * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
   2204  10710  jonathan  * and return number of CPU APIC IDs.
   2205   6671       jjc  *
   2206   6671       jjc  * NOTE: This code assumes that CPU IDs are assigned in order that they appear
   2207   6671       jjc  *       in in cpu_apicid_array boot property which is based on and follows
   2208   6671       jjc  *	 same ordering as processor list in ACPI MADT.  If the code in
   2209   6671       jjc  *	 usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns
   2210   6671       jjc  *	 CPU IDs ever changes, then this code will need to change too....
   2211   6671       jjc  */
   2212   6671       jjc static int
   2213   6706       jjc lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
   2214   6671       jjc {
   2215   6706       jjc 	int	boot_prop_len;
   2216   6671       jjc 	char	*boot_prop_name = BP_CPU_APICID_ARRAY;
   2217   6671       jjc 	uint8_t	cpu_apicid_array[UINT8_MAX + 1];
   2218   6671       jjc 	int	i;
   2219   6706       jjc 	int	n;
   2220   6671       jjc 
   2221   6671       jjc 	/*
   2222   6671       jjc 	 * Check length of property value
   2223   6671       jjc 	 */
   2224   6671       jjc 	boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
   2225   6706       jjc 	if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array))
   2226  10710  jonathan 		return (-1);
   2227   6706       jjc 
   2228   6706       jjc 	/*
   2229   6706       jjc 	 * Calculate number of entries in array and return when there's just
   2230   6706       jjc 	 * one CPU since that's not very interesting for NUMA
   2231   6706       jjc 	 */
   2232   6706       jjc 	n = boot_prop_len / sizeof (uint8_t);
   2233   6706       jjc 	if (n == 1)
   2234  10710  jonathan 		return (-2);
   2235   6671       jjc 
   2236   6671       jjc 	/*
   2237   6671       jjc 	 * Get CPU to APIC ID property value
   2238   6671       jjc 	 */
   2239   6671       jjc 	if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
   2240  10710  jonathan 		return (-3);
   2241  10710  jonathan 
   2242  10710  jonathan 	/*
   2243  10710  jonathan 	 * Just return number of CPU APIC IDs if CPU to node mapping table is
   2244  10710  jonathan 	 * NULL
   2245  10710  jonathan 	 */
   2246  10710  jonathan 	if (cpu_node == NULL)
   2247  10710  jonathan 		return (n);
   2248   6671       jjc 
   2249   6671       jjc 	/*
   2250   6671       jjc 	 * Fill in CPU to node ID mapping table with APIC ID for each CPU
   2251   6671       jjc 	 */
   2252   6706       jjc 	for (i = 0; i < n; i++) {
   2253   6671       jjc 		cpu_node[i].exists = 1;
   2254   6671       jjc 		cpu_node[i].apicid = cpu_apicid_array[i];
   2255   6671       jjc 	}
   2256   6671       jjc 
   2257   6706       jjc 	/*
   2258   6706       jjc 	 * Return number of CPUs based on number of APIC IDs
   2259   6706       jjc 	 */
   2260   6706       jjc 	return (n);
   2261   6671       jjc }
   2262   6671       jjc 
   2263   6671       jjc 
   2264   6671       jjc /*
   2265   6445       jjc  * Read ACPI System Locality Information Table (SLIT) to determine how far each
   2266   6445       jjc  * NUMA node is from each other
   2267   6445       jjc  */
   2268   6445       jjc static int
   2269   6445       jjc lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
   2270   6445       jjc     node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats)
   2271   6445       jjc {
   2272   6445       jjc 	int		i;
   2273   6445       jjc 	int		j;
   2274   6445       jjc 	int		localities;
   2275   6445       jjc 	hrtime_t	max;
   2276   6445       jjc 	hrtime_t	min;
   2277   6445       jjc 	int		retval;
   2278   6445       jjc 	uint8_t		*slit_entries;
   2279   6445       jjc 
   2280   6445       jjc 	if (tp == NULL || !lgrp_plat_slit_enable)
   2281   6445       jjc 		return (1);
   2282   6445       jjc 
   2283   6445       jjc 	if (lat_stats == NULL)
   2284   6445       jjc 		return (2);
   2285   6445       jjc 
   2286   6445       jjc 	localities = tp->number;
   2287   6445       jjc 	if (localities != node_cnt)
   2288   6445       jjc 		return (3);
   2289   6445       jjc 
   2290   6445       jjc 	min = lat_stats->latency_min;
   2291   6445       jjc 	max = lat_stats->latency_max;
   2292   6445       jjc 
   2293   6445       jjc 	/*
   2294   6445       jjc 	 * Fill in latency matrix based on SLIT entries
   2295   6445       jjc 	 */
   2296   6445       jjc 	slit_entries = tp->entry;
   2297   6445       jjc 	for (i = 0; i < localities; i++) {
   2298   6445       jjc 		for (j = 0; j < localities; j++) {
   2299   6445       jjc 			uint8_t	latency;
   2300   6445       jjc 
   2301   6445       jjc 			latency = slit_entries[(i * localities) + j];
   2302   6445       jjc 			lat_stats->latencies[i][j] = latency;
   2303   6565       jjc 			if (latency < min || min == -1)
   2304   6445       jjc 				min = latency;
   2305   6445       jjc 			if (latency > max)
   2306   6445       jjc 				max = latency;
   2307   6445       jjc 		}
   2308   6445       jjc 	}
   2309   6445       jjc 
   2310   6445       jjc 	/*
   2311   6445       jjc 	 * Verify that latencies/distances given in SLIT look reasonable
   2312   6445       jjc 	 */
   2313   6445       jjc 	retval = lgrp_plat_latency_verify(node_memory, lat_stats);
   2314   6445       jjc 
   2315   6445       jjc 	if (retval) {
   2316   6445       jjc 		/*
   2317   6445       jjc 		 * Reinitialize (zero) latency table since SLIT doesn't look
   2318   6445       jjc 		 * right
   2319   6445       jjc 		 */
   2320   6445       jjc 		for (i = 0; i < localities; i++) {
   2321   6445       jjc 			for (j = 0; j < localities; j++)
   2322   6445       jjc 				lat_stats->latencies[i][j] = 0;
   2323   6445       jjc 		}
   2324   6445       jjc 	} else {
   2325   6445       jjc 		/*
   2326   6445       jjc 		 * Update min and max latencies seen since SLIT looks valid
   2327   6445       jjc 		 */
   2328   6445       jjc 		lat_stats->latency_min = min;
   2329   6445       jjc 		lat_stats->latency_max = max;
   2330   6445       jjc 	}
   2331   6445       jjc 
   2332   6445       jjc 	return (retval);
   2333   6445       jjc }
   2334   6445       jjc 
   2335   6445       jjc 
   2336   6445       jjc /*
   2337   6445       jjc  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
   2338   6706       jjc  * and memory are local to each other in the same NUMA node and return number
   2339   6706       jjc  * of nodes
   2340   6445       jjc  */
   2341   6445       jjc static int
   2342   9716  jonathan lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min,
   2343   9716  jonathan     node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, int cpu_count,
   2344   9716  jonathan     node_phys_addr_map_t *node_memory)
   2345   6445       jjc {
   2346   6565       jjc 	struct srat_item	*srat_end;
   2347   6445       jjc 	int			i;
   2348   6445       jjc 	struct srat_item	*item;
   2349   6706       jjc 	int			node_cnt;
   2350   6671       jjc 	int			proc_entry_count;
   2351   6445       jjc 
   2352   6706       jjc 	/*
   2353   6706       jjc 	 * Nothing to do when no SRAT or disabled
   2354   6706       jjc 	 */
   2355   6445       jjc 	if (tp == NULL || !lgrp_plat_srat_enable)
   2356   6706       jjc 		return (-1);
   2357   6445       jjc 
   2358   6445       jjc 	/*
   2359   6445       jjc 	 * Determine number of nodes by counting number of proximity domains in
   2360   6706       jjc 	 * SRAT and return if number of nodes is 1 or less since don't need to
   2361   6706       jjc 	 * read SRAT then
   2362   6445       jjc 	 */
   2363   9716  jonathan 	node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
   2364   6706       jjc 	if (node_cnt == 1)
   2365   6706       jjc 		return (1);
   2366   6706       jjc 	else if (node_cnt <= 0)
   2367   6706       jjc 		return (-2);
   2368   6445       jjc 
   2369   6445       jjc 	/*
   2370   6445       jjc 	 * Walk through SRAT, examining each CPU and memory entry to determine
   2371   6445       jjc 	 * which CPUs and memory belong to which node.
   2372   6445       jjc 	 */
   2373   6445       jjc 	item = tp->list;
   2374   6565       jjc 	srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
   2375   6671       jjc 	proc_entry_count = 0;
   2376   6565       jjc 	while (item < srat_end) {
   2377   6445       jjc 		uint32_t	apic_id;
   2378   6445       jjc 		uint32_t	domain;
   2379   6445       jjc 		uint64_t	end;
   2380   6445       jjc 		uint64_t	length;
   2381   6445       jjc 		uint64_t	start;
   2382   6445       jjc 
   2383   6445       jjc 		switch (item->type) {
   2384   6445       jjc 		case SRAT_PROCESSOR:	/* CPU entry */
   2385   6445       jjc 			if (!(item->i.p.flags & SRAT_ENABLED) ||
   2386   6445       jjc 			    cpu_node == NULL)
   2387      0    stevel 				break;
   2388   6445       jjc 
   2389   6445       jjc 			/*
   2390   6445       jjc 			 * Calculate domain (node) ID and fill in APIC ID to
   2391   6445       jjc 			 * domain/node mapping table
   2392   6445       jjc 			 */
   2393   6445       jjc 			domain = item->i.p.domain1;
   2394   6445       jjc 			for (i = 0; i < 3; i++) {
   2395   6445       jjc 				domain += item->i.p.domain2[i] <<
   2396   6445       jjc 				    ((i + 1) * 8);
   2397   6445       jjc 			}
   2398   6445       jjc 			apic_id = item->i.p.apic_id;
   2399   6445       jjc 
   2400   6706       jjc 			if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
   2401   6706       jjc 			    cpu_node, cpu_count, apic_id, domain) < 0)
   2402   6706       jjc 				return (-3);
   2403   6671       jjc 
   2404   6671       jjc 			proc_entry_count++;
   2405   6445       jjc 			break;
   2406   6445       jjc 
   2407   6445       jjc 		case SRAT_MEMORY:	/* memory entry */
   2408   6445       jjc 			if (!(item->i.m.flags & SRAT_ENABLED) ||
   2409   6445       jjc 			    node_memory == NULL)
   2410      0    stevel 				break;
   2411   6445       jjc 
   2412   6445       jjc 			/*
   2413   6445       jjc 			 * Get domain (node) ID and fill in domain/node
   2414   6445       jjc 			 * to memory mapping table
   2415   6445       jjc 			 */
   2416   6445       jjc 			domain = item->i.m.domain;
   2417   6445       jjc 			start = item->i.m.base_addr;
   2418   6445       jjc 			length = item->i.m.len;
   2419   6445       jjc 			end = start + length - 1;
   2420   6445       jjc 
   2421   6706       jjc 			if (lgrp_plat_node_memory_update(node_domain, node_cnt,
   2422   6445       jjc 			    node_memory, start, end, domain) < 0)
   2423   6706       jjc 				return (-4);
   2424   6445       jjc 			break;
   2425   7282    mishra 		case SRAT_X2APIC:	/* x2apic CPU entry */
   2426   7282    mishra 			if (!(item->i.xp.flags & SRAT_ENABLED) ||
   2427   7282    mishra 			    cpu_node == NULL)
   2428   7282    mishra 				break;
   2429   7282    mishra 
   2430   7282    mishra 			/*
   2431   7282    mishra 			 * Calculate domain (node) ID and fill in APIC ID to
   2432   7282    mishra 			 * domain/node mapping table
   2433   7282    mishra 			 */
   2434   7282    mishra 			domain = item->i.xp.domain;
   2435   7282    mishra 			apic_id = item->i.xp.x2apic_id;
   2436   7282    mishra 
   2437   7282    mishra 			if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
   2438   7282    mishra 			    cpu_node, cpu_count, apic_id, domain) < 0)
   2439   7282    mishra 				return (-3);
   2440   7282    mishra 
   2441   7282    mishra 			proc_entry_count++;
   2442   7282    mishra 			break;
   2443   6445       jjc 
   2444   6445       jjc 		default:
   2445   6445       jjc 			break;
   2446   6445       jjc 		}
   2447   6445       jjc 
   2448   6445       jjc 		item = (struct srat_item *)((uintptr_t)item + item->len);
   2449   6445       jjc 	}
   2450   6671       jjc 
   2451   6671       jjc 	/*
   2452   6671       jjc 	 * Should have seen at least as many SRAT processor entries as CPUs
   2453   6671       jjc 	 */
   2454   6706       jjc 	if (proc_entry_count < cpu_count)
   2455   6706       jjc 		return (-5);
   2456   6671       jjc 
   2457   9716  jonathan 	/*
   2458   9716  jonathan 	 * Need to sort nodes by starting physical address since VM system
   2459   9716  jonathan 	 * assumes and expects memnodes to be sorted in ascending order by
   2460   9716  jonathan 	 * physical address
   2461   9716  jonathan 	 */
   2462   9716  jonathan 	lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
   2463   9716  jonathan 	    node_memory);
   2464   9716  jonathan 
   2465   6706       jjc 	return (node_cnt);
   2466  10710  jonathan }
   2467  10710  jonathan 
   2468  10710  jonathan 
   2469  10710  jonathan /*
   2470  10710  jonathan  * Allocate permanent memory for any temporary memory that we needed to
   2471  10710  jonathan  * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
   2472  10710  jonathan  * initialized and copy everything from temporary to permanent memory since
   2473  10710  jonathan  * temporary boot memory will eventually be released during boot
   2474  10710  jonathan  */
   2475  10710  jonathan static void
   2476  10710  jonathan lgrp_plat_release_bootstrap(void)
   2477  10710  jonathan {
   2478  10710  jonathan 	void	*buf;
   2479  10710  jonathan 	size_t	size;
   2480  10710  jonathan 
   2481  10710  jonathan 	if (lgrp_plat_cpu_node_nentries > 0) {
   2482  10710  jonathan 		size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
   2483  10710  jonathan 		buf = kmem_alloc(size, KM_SLEEP);
   2484  10710  jonathan 		bcopy(lgrp_plat_cpu_node, buf, size);
   2485  10710  jonathan 		lgrp_plat_cpu_node = buf;
   2486  10710  jonathan 	}
   2487   6445       jjc }
   2488   6445       jjc 
   2489   6445       jjc 
   2490   6445       jjc /*
   2491   6445       jjc  * Return number of proximity domains given in ACPI SRAT
   2492   6445       jjc  */
   2493   6445       jjc static int
   2494   9716  jonathan lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min)
   2495   6445       jjc {
   2496   6445       jjc 	int			domain_cnt;
   2497   9716  jonathan 	uint32_t		domain_min;
   2498   6445       jjc 	struct srat_item	*end;
   2499   6445       jjc 	int			i;
   2500   6445       jjc 	struct srat_item	*item;
   2501   6445       jjc 	node_domain_map_t	node_domain[MAX_NODES];
   2502   6445       jjc 
   2503   6445       jjc 
   2504   6445       jjc 	if (tp == NULL || !lgrp_plat_srat_enable)
   2505   6445       jjc 		return (1);
   2506   9716  jonathan 
   2507   9716  jonathan 	/*
   2508   9716  jonathan 	 * Walk through SRAT to find minimum proximity domain ID
   2509   9716  jonathan 	 */
   2510   9716  jonathan 	domain_min = UINT32_MAX;
   2511   9716  jonathan 	item = tp->list;
   2512   9716  jonathan 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
   2513   9716  jonathan 	while (item < end) {
   2514   9716  jonathan 		uint32_t	domain;
   2515   9716  jonathan 
   2516   9716  jonathan 		switch (item->type) {
   2517   9716  jonathan 		case SRAT_PROCESSOR:	/* CPU entry */
   2518   9716  jonathan 			if (!(item->i.p.flags & SRAT_ENABLED)) {
   2519   9716  jonathan 				item = (struct srat_item *)((uintptr_t)item +
   2520   9716  jonathan 				    item->len);
   2521   9716  jonathan 				continue;
   2522   9716  jonathan 			}
   2523   9716  jonathan 			domain = item->i.p.domain1;
   2524   9716  jonathan 			for (i = 0; i < 3; i++) {
   2525   9716  jonathan 				domain += item->i.p.domain2[i] <<
   2526   9716  jonathan 				    ((i + 1) * 8);
   2527   9716  jonathan 			}
   2528   9716  jonathan 			break;
   2529   9716  jonathan 
   2530   9716  jonathan 		case SRAT_MEMORY:	/* memory entry */
   2531   9716  jonathan 			if (!(item->i.m.flags & SRAT_ENABLED)) {
   2532   9716  jonathan 				item = (struct srat_item *)((uintptr_t)item +
   2533   9716  jonathan 				    item->len);
   2534   9716  jonathan 				continue;
   2535   9716  jonathan 			}
   2536   9716  jonathan 			domain = item->i.m.domain;
   2537   9716  jonathan 			break;
   2538   9716  jonathan 
   2539   9716  jonathan 		case SRAT_X2APIC:	/* x2apic CPU entry */
   2540   9716  jonathan 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
   2541   9716  jonathan 				item = (struct srat_item *)((uintptr_t)item +
   2542   9716  jonathan 				    item->len);
   2543   9716  jonathan 				continue;
   2544   9716  jonathan 			}
   2545   9716  jonathan 			domain = item->i.xp.domain;
   2546   9716  jonathan 			break;
   2547   9716  jonathan 
   2548   9716  jonathan 		default:
   2549   9716  jonathan 			item = (struct srat_item *)((uintptr_t)item +
   2550   9716  jonathan 			    item->len);
   2551   9716  jonathan 			continue;
   2552   9716  jonathan 		}
   2553   9716  jonathan 
   2554   9716  jonathan 		/*
   2555   9716  jonathan 		 * Keep track of minimum proximity domain ID
   2556   9716  jonathan 		 */
   2557   9716  jonathan 		if (domain < domain_min)
   2558   9716  jonathan 			domain_min = domain;
   2559   9716  jonathan 
   2560   9716  jonathan 		item = (struct srat_item *)((uintptr_t)item + item->len);
   2561   9716  jonathan 	}
   2562   9716  jonathan 	if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
   2563   9716  jonathan 		*prox_domain_min = domain_min;
   2564   6445       jjc 
   2565   6445       jjc 	/*
   2566   6445       jjc 	 * Walk through SRAT, examining each CPU and memory entry to determine
   2567   6445       jjc 	 * proximity domain ID for each.
   2568   6445       jjc 	 */
   2569   6445       jjc 	domain_cnt = 0;
   2570   6445       jjc 	item = tp->list;
   2571   6445       jjc 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
   2572   6445       jjc 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
   2573   6445       jjc 	while (item < end) {
   2574   6445       jjc 		uint32_t	domain;
   2575   6445       jjc 		boolean_t	overflow;
   2576   6445       jjc 		uint_t		start;
   2577   6445       jjc 
   2578   6445       jjc 		switch (item->type) {
   2579   6445       jjc 		case SRAT_PROCESSOR:	/* CPU entry */
   2580   9716  jonathan 			if (!(item->i.p.flags & SRAT_ENABLED)) {
   2581   9716  jonathan 				item = (struct srat_item *)((uintptr_t)item +
   2582   9716  jonathan 				    item->len);
   2583   9716  jonathan 				continue;
   2584   9716  jonathan 			}
   2585   6445       jjc 			domain = item->i.p.domain1;
   2586   6445       jjc 			for (i = 0; i < 3; i++) {
   2587   6445       jjc 				domain += item->i.p.domain2[i] <<
   2588   6445       jjc 				    ((i + 1) * 8);
   2589   6445       jjc 			}
   2590   6445       jjc 			break;
   2591   6445       jjc 
   2592   6445       jjc 		case SRAT_MEMORY:	/* memory entry */
   2593   9716  jonathan 			if (!(item->i.m.flags & SRAT_ENABLED)) {
   2594   9716  jonathan 				item = (struct srat_item *)((uintptr_t)item +
   2595   9716  jonathan 				    item->len);
   2596   9716  jonathan 				continue;
   2597   9716  jonathan 			}
   2598   6445       jjc 			domain = item->i.m.domain;
   2599   7282    mishra 			break;
   2600   7282    mishra 
   2601   7282    mishra 		case SRAT_X2APIC:	/* x2apic CPU entry */
   2602   9716  jonathan 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
   2603   9716  jonathan 				item = (struct srat_item *)((uintptr_t)item +
   2604   9716  jonathan 				    item->len);
   2605   9716  jonathan 				continue;
   2606   9716  jonathan 			}
   2607   7282    mishra 			domain = item->i.xp.domain;
   2608   6445       jjc 			break;
   2609   6445       jjc 
   2610   6445       jjc 		default:
   2611   9716  jonathan 			item = (struct srat_item *)((uintptr_t)item +
   2612   9716  jonathan 			    item->len);
   2613   9716  jonathan 			continue;
   2614   6445       jjc 		}
   2615   6445       jjc 
   2616   6445       jjc 		/*
   2617   6445       jjc 		 * Count and keep track of which proximity domain IDs seen
   2618   6445       jjc 		 */
   2619   6445       jjc 		start = i = domain % MAX_NODES;
   2620   6445       jjc 		overflow = B_TRUE;
   2621   6445       jjc 		do {
   2622   6445       jjc 			/*
   2623   6445       jjc 			 * Create entry for proximity domain and increment
   2624   6445       jjc 			 * count when no entry exists where proximity domain
   2625   6445       jjc 			 * hashed
   2626   6445       jjc 			 */
   2627   6445       jjc 			if (!node_domain[i].exists) {
   2628   6445       jjc 				node_domain[i].exists = 1;
   2629   6445       jjc 				node_domain[i].prox_domain = domain;
   2630   6445       jjc 				domain_cnt++;
   2631   6445       jjc 				overflow = B_FALSE;
   2632      0    stevel 				break;
   2633      0    stevel 			}
   2634   6445       jjc 
   2635   6445       jjc 			/*
   2636   6445       jjc 			 * Nothing to do when proximity domain seen already
   2637   6445       jjc 			 * and its entry exists
   2638   6445       jjc 			 */
   2639   6445       jjc 			if (node_domain[i].prox_domain == domain) {
   2640   6445       jjc 				overflow = B_FALSE;
   2641   6445       jjc 				break;
   2642   6445       jjc 			}
   2643   6445       jjc 
   2644   6445       jjc 			/*
   2645   6445       jjc 			 * Entry exists where proximity domain hashed, but for
   2646   6445       jjc 			 * different proximity domain so keep search for empty
   2647   6445       jjc 			 * slot to put it or matching entry whichever comes
   2648   6445       jjc 			 * first.
   2649   6445       jjc 			 */
   2650   6445       jjc 			i = (i + 1) % MAX_NODES;
   2651   6445       jjc 		} while (i != start);
   2652   6445       jjc 
   2653   6445       jjc 		/*
   2654   6445       jjc 		 * Didn't find empty or matching entry which means have more
   2655   6445       jjc 		 * proximity domains than supported nodes (:-(
   2656   6445       jjc 		 */
   2657   6445       jjc 		ASSERT(overflow != B_TRUE);
   2658   6445       jjc 		if (overflow == B_TRUE)
   2659   6445       jjc 			return (-1);
   2660   6445       jjc 
   2661   6445       jjc 		item = (struct srat_item *)((uintptr_t)item + item->len);
   2662   6445       jjc 	}
   2663   6445       jjc 	return (domain_cnt);
   2664   6445       jjc }
   2665   6445       jjc 
   2666   6445       jjc 
   2667   6445       jjc /*
   2668   6445       jjc  * Set lgroup latencies for 2 level lgroup topology
   2669   6445       jjc  */
   2670   6445       jjc static void
   2671   6445       jjc lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
   2672   6445       jjc     lgrp_plat_latency_stats_t *lat_stats)
   2673   6445       jjc {
   2674   6445       jjc 	int	i;
   2675   6445       jjc 
   2676   6445       jjc 	ASSERT(node_memory != NULL && lat_stats != NULL);
   2677   6445       jjc 
   2678   6445       jjc 	if (lgrp_plat_node_cnt >= 4)
   2679   6445       jjc 		cmn_err(CE_NOTE,
   2680   6445       jjc 		    "MPO only optimizing for local and remote\n");
   2681   6445       jjc 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
   2682   6445       jjc 		int	j;
   2683   6445       jjc 
   2684   6445       jjc 		if (!node_memory[i].exists)
   2685   6445       jjc 			continue;
   2686   6445       jjc 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
   2687   6445       jjc 			if (!node_memory[j].exists)
   2688   6445       jjc 				continue;
   2689   6445       jjc 			if (i == j)
   2690   6445       jjc 				lat_stats->latencies[i][j] = 2;
   2691   6445       jjc 			else
   2692   6445       jjc 				lat_stats->latencies[i][j] = 3;
   2693      0    stevel 		}
   2694      0    stevel 	}
   2695   6445       jjc 	lat_stats->latency_min = 2;
   2696   6445       jjc 	lat_stats->latency_max = 3;
   2697   6445       jjc 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
   2698      0    stevel }
   2699      0    stevel 
   2700   6445       jjc 
   2701      0    stevel /*
   2702   6445       jjc  * The following Opteron specific constants, macros, types, and routines define
   2703   6445       jjc  * PCI configuration space registers and how to read them to determine the NUMA
   2704   6445       jjc  * configuration of *supported* Opteron processors.  They provide the same
   2705   6445       jjc  * information that may be gotten from the ACPI System Resource Affinity Table
   2706   6445       jjc  * (SRAT) if it exists on the machine of interest.
   2707      0    stevel  *
   2708   6445       jjc  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
   2709   6445       jjc  * of interest describes all of these registers and their contents.  The main
   2710   6445       jjc  * registers used by this code to determine the NUMA configuration of the
   2711   6445       jjc  * machine are the node ID register for the number of NUMA nodes and the DRAM
   2712   6445       jjc  * address map registers for the physical address range of each node.
   2713   6445       jjc  *
   2714   6445       jjc  * NOTE: The format and how to determine the NUMA configuration using PCI
   2715   6445       jjc  *	 config space registers may change or may not be supported in future
   2716   6445       jjc  *	 Opteron processor families.
   2717      0    stevel  */
   2718   6445       jjc 
   2719   6445       jjc /*
   2720   6445       jjc  * How many bits to shift Opteron DRAM Address Map base and limit registers
   2721   6445       jjc  * to get actual value
   2722   6445       jjc  */
   2723   6445       jjc #define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
   2724   6445       jjc #define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */
   2725   6445       jjc 
   2726   6445       jjc #define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
   2727   6445       jjc #define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */
   2728   6445       jjc 
   2729   6445       jjc #define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */
   2730   6445       jjc 
   2731   6445       jjc /*
   2732   6445       jjc  * Macros to derive addresses from Opteron DRAM Address Map registers
   2733   6445       jjc  */
   2734   6445       jjc #define	OPT_DRAMADDR_HI(reg) \
   2735   6445       jjc 	(((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \
   2736   6445       jjc 	    OPT_DRAMADDR_HI_LSHIFT_ADDR)
   2737   6445       jjc 
   2738   6445       jjc #define	OPT_DRAMADDR_LO(reg) \
   2739   6445       jjc 	(((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \
   2740   6445       jjc 	    OPT_DRAMADDR_LO_LSHIFT_ADDR)
   2741   6445       jjc 
   2742   6445       jjc #define	OPT_DRAMADDR(high, low) \
   2743   6445       jjc 	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
   2744   6445       jjc 
   2745   6445       jjc /*
   2746   6445       jjc  * Bit masks defining what's in Opteron DRAM Address Map base register
   2747   6445       jjc  */
   2748   6445       jjc #define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
   2749   6445       jjc #define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
   2750   6445       jjc #define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */
   2751   6445       jjc 
   2752   6445       jjc /*
   2753   6445       jjc  * Bit masks defining what's in Opteron DRAM Address Map limit register
   2754   6445       jjc  */
   2755   6445       jjc #define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7		/* destination node */
   2756   6445       jjc #define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700		/* interleave select */
   2757   6445       jjc 
   2758   6445       jjc 
   2759   6445       jjc /*
   2760   6445       jjc  * Opteron Node ID register in PCI configuration space contains
   2761   6445       jjc  * number of nodes in system, etc. for Opteron K8.  The following
   2762   6445       jjc  * constants and macros define its contents, structure, and access.
   2763   6445       jjc  */
   2764   6445       jjc 
   2765   6445       jjc /*
   2766   6445       jjc  * Bit masks defining what's in Opteron Node ID register
   2767   6445       jjc  */
   2768   6445       jjc #define	OPT_NODE_MASK_ID	0x7	/* node ID */
   2769   6445       jjc #define	OPT_NODE_MASK_CNT	0x70	/* node count */
   2770   6445       jjc #define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
   2771   6445       jjc #define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
   2772   6445       jjc #define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU)  */
   2773   6445       jjc 
   2774   6445       jjc /*
   2775   6445       jjc  * How many bits in Opteron Node ID register to shift right to get actual value
   2776   6445       jjc  */
   2777   6445       jjc #define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */
   2778   6445       jjc 
   2779   6445       jjc /*
   2780   6445       jjc  * Macros to get values from Opteron Node ID register
   2781   6445       jjc  */
   2782   6445       jjc #define	OPT_NODE_CNT(reg) \
   2783   6445       jjc 	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
   2784   6445       jjc 
   2785   6445       jjc /*
   2786   6445       jjc  * Macro to setup PCI Extended Configuration Space (ECS) address to give to
   2787   6445       jjc  * "in/out" instructions
   2788   6445       jjc  *
   2789   6445       jjc  * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
   2790   6445       jjc  *	 other uses should just do MMIO to access PCI ECS.
   2791   6445       jjc  *	 Must enable special bit in Northbridge Configuration Register on
   2792   6445       jjc  *	 Greyhound for extended CF8 space access to be able to access PCI ECS
   2793   6445       jjc  *	 using "in/out" instructions and restore special bit after done
   2794   6445       jjc  *	 accessing PCI ECS.
   2795   6445       jjc  */
   2796   6445       jjc #define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
   2797   6445       jjc 	(PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11)  | \
   2798   6445       jjc 	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
   2799   6445       jjc 	    ((((reg) >> 8) & 0xf) << 24))
   2800   6445       jjc 
   2801   6445       jjc /*
   2802   6445       jjc  * PCI configuration space registers accessed by specifying
   2803   6445       jjc  * a bus, device, function, and offset.  The following constants
   2804   6445       jjc  * define the values needed to access Opteron K8 configuration
   2805   6445       jjc  * info to determine its node topology
   2806   6445       jjc  */
   2807   6445       jjc 
   2808   6445       jjc #define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */
   2809   6445       jjc 
   2810   6445       jjc /*
   2811   6445       jjc  * Opteron PCI configuration space register function values
   2812   6445       jjc  */
   2813   6445       jjc #define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
   2814   6445       jjc #define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
   2815   6445       jjc #define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
   2816   6445       jjc #define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */
   2817   6445       jjc 
   2818   6445       jjc /*
   2819   6445       jjc  * PCI Configuration Space register offsets
   2820   6445       jjc  */
   2821   6445       jjc #define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
   2822   6445       jjc #define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
   2823   6445       jjc #define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
   2824   6445       jjc #define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */
   2825   6445       jjc 
   2826   6445       jjc /*
   2827   6445       jjc  * Opteron PCI Configuration Space device IDs for nodes
   2828   6445       jjc  */
   2829   6445       jjc #define	OPT_PCS_DEV_NODE0		24	/* device number for node 0 */
   2830   6445       jjc 
   2831   6445       jjc 
   2832   6445       jjc /*
   2833   6445       jjc  * Opteron DRAM address map gives base and limit for physical memory in a node
   2834   6445       jjc  */
   2835   6445       jjc typedef	struct opt_dram_addr_map {
   2836   6445       jjc 	uint32_t	base_hi;
   2837   6445       jjc 	uint32_t	base_lo;
   2838   6445       jjc 	uint32_t	limit_hi;
   2839   6445       jjc 	uint32_t	limit_lo;
   2840   6445       jjc } opt_dram_addr_map_t;
   2841   6445       jjc 
   2842   6445       jjc 
   2843   6445       jjc /*
   2844   6445       jjc  * Supported AMD processor families
   2845   6445       jjc  */
   2846   6445       jjc #define	AMD_FAMILY_HAMMER	15
   2847   6445       jjc #define	AMD_FAMILY_GREYHOUND	16
   2848   6445       jjc 
   2849   6445       jjc /*
   2850   6445       jjc  * Whether to have is_opteron() return 1 even when processor isn't supported
   2851   6445       jjc  */
   2852   6445       jjc uint_t	is_opteron_override = 0;
   2853   6445       jjc 
   2854   6445       jjc /*
   2855   6445       jjc  * AMD processor family for current CPU
   2856   6445       jjc  */
   2857   6445       jjc uint_t	opt_family = 0;
   2858   6445       jjc 
   2859   6445       jjc 
   2860   6445       jjc /*
   2861   6445       jjc  * Determine whether we're running on a supported AMD Opteron since reading
   2862   6445       jjc  * node count and DRAM address map registers may have different format or
   2863   6445       jjc  * may not be supported across processor families
   2864   6445       jjc  */
   2865   6445       jjc static int
   2866   6445       jjc is_opteron(void)
   2867      0    stevel {
   2868      0    stevel 
   2869   6445       jjc 	if (x86_vendor != X86_VENDOR_AMD)
   2870      0    stevel 		return (0);
   2871      0    stevel 
   2872   6445       jjc 	opt_family = cpuid_getfamily(CPU);
   2873   6445       jjc 	if (opt_family == AMD_FAMILY_HAMMER ||
   2874   6445       jjc 	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
   2875   6445       jjc 		return (1);
   2876   6445       jjc 	else
   2877   6445       jjc 		return (0);
   2878   6445       jjc }
   2879      0    stevel 
   2880   6445       jjc 
   2881   6445       jjc /*
   2882   6445       jjc  * Determine NUMA configuration for Opteron from registers that live in PCI
   2883   6445       jjc  * configuration space
   2884   6445       jjc  */
   2885   6445       jjc static void
   2886   6445       jjc opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
   2887   6445       jjc     node_phys_addr_map_t *node_memory)
   2888   6445       jjc {
   2889   6445       jjc 	uint_t				bus;
   2890   6445       jjc 	uint_t				dev;
   2891   6445       jjc 	struct opt_dram_addr_map	dram_map[MAX_NODES];
   2892   6445       jjc 	uint_t				node;
   2893   6445       jjc 	uint_t				node_info[MAX_NODES];
   2894   6445       jjc 	uint_t				off_hi;
   2895   6445       jjc 	uint_t				off_lo;
   2896   6445       jjc 	uint64_t			nb_cfg_reg;
   2897      0    stevel 
   2898      0    stevel 	/*
   2899   6445       jjc 	 * Read configuration registers from PCI configuration space to
   2900   6445       jjc 	 * determine node information, which memory is in each node, etc.
   2901   6445       jjc 	 *
   2902   6445       jjc 	 * Write to PCI configuration space address register to specify
   2903   6445       jjc 	 * which configuration register to read and read/write PCI
   2904   6445       jjc 	 * configuration space data register to get/set contents
   2905      0    stevel 	 */
   2906   6445       jjc 	bus = OPT_PCS_BUS_CONFIG;
   2907   6445       jjc 	dev = OPT_PCS_DEV_NODE0;
   2908   6445       jjc 	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
   2909   6445       jjc 	off_lo = OPT_PCS_OFF_DRAMBASE_LO;
   2910      0    stevel 
   2911      0    stevel 	/*
   2912   6445       jjc 	 * Read node ID register for node 0 to get node count
   2913      0    stevel 	 */
   2914   6445       jjc 	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
   2915   6445       jjc 	    OPT_PCS_OFF_NODEID);
   2916   6445       jjc 	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
   2917      0    stevel 
   2918   6445       jjc 	/*
   2919   6445       jjc 	 * If number of nodes is more than maximum supported, then set node
   2920   6445       jjc 	 * count to 1 and treat system as UMA instead of NUMA.
   2921   6445       jjc 	 */
   2922   6445       jjc 	if (*node_cnt > MAX_NODES) {
   2923   6445       jjc 		*node_cnt = 1;
   2924   6445       jjc 		return;
   2925   6445       jjc 	}
   2926   6445       jjc 
   2927   6445       jjc 	/*
   2928   6445       jjc 	 * For Greyhound, PCI Extended Configuration Space must be enabled to
   2929   6445       jjc 	 * read high DRAM address map base and limit registers
   2930   6445       jjc 	 */
   2931   6445       jjc 	if (opt_family == AMD_FAMILY_GREYHOUND) {
   2932   6445       jjc 		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
   2933   6445       jjc 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
   2934   6445       jjc 			wrmsr(MSR_AMD_NB_CFG,
   2935   6445       jjc 			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
   2936   6445       jjc 	}
   2937   6445       jjc 
   2938   6445       jjc 	for (node = 0; node < *node_cnt; node++) {
   2939   6445       jjc 		uint32_t	base_hi;
   2940   6445       jjc 		uint32_t	base_lo;
   2941   6445       jjc 		uint32_t	limit_hi;
   2942   6445       jjc 		uint32_t	limit_lo;
   2943   6445       jjc 
   2944   6445       jjc 		/*
   2945   6445       jjc 		 * Read node ID register (except for node 0 which we just read)
   2946   6445       jjc 		 */
   2947   6445       jjc 		if (node > 0) {
   2948   6445       jjc 			node_info[node] = pci_getl_func(bus, dev,
   2949   6445       jjc 			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
   2950   6445       jjc 		}
   2951   6445       jjc 
   2952   6445       jjc 		/*
   2953   6445       jjc 		 * Read DRAM base and limit registers which specify
   2954   6445       jjc 		 * physical memory range of each node
   2955   6445       jjc 		 */
   2956   6445       jjc 		if (opt_family != AMD_FAMILY_GREYHOUND)
   2957   6445       jjc 			base_hi = 0;
   2958   6445       jjc 		else {
   2959   6445       jjc 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
   2960   6445       jjc 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
   2961   6445       jjc 			base_hi = dram_map[node].base_hi =
   2962   6445       jjc 			    inl(PCI_CONFDATA);
   2963   6445       jjc 		}
   2964   6445       jjc 		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
   2965   6445       jjc 		    OPT_PCS_FUNC_ADDRMAP, off_lo);
   2966   6445       jjc 
   2967   6445       jjc 		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
   2968   6445       jjc 		    mem_intrlv)
   2969   6445       jjc 			*mem_intrlv = *mem_intrlv + 1;
   2970   6445       jjc 
   2971   6445       jjc 		off_hi += 4;	/* high limit register offset */
   2972   6445       jjc 		if (opt_family != AMD_FAMILY_GREYHOUND)
   2973   6445       jjc 			limit_hi = 0;
   2974   6445       jjc 		else {
   2975   6445       jjc 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
   2976   6445       jjc 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
   2977   6445       jjc 			limit_hi = dram_map[node].limit_hi =
   2978   6445       jjc 			    inl(PCI_CONFDATA);
   2979   6445       jjc 		}
   2980   6445       jjc 
   2981   6445       jjc 		off_lo += 4;	/* low limit register offset */
   2982   6445       jjc 		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
   2983   6445       jjc 		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);
   2984   6445       jjc 
   2985   6445       jjc 		/*
   2986   6445       jjc 		 * Increment device number to next node and register offsets
   2987   6445       jjc 		 * for DRAM base register of next node
   2988   6445       jjc 		 */
   2989   6445       jjc 		off_hi += 4;
   2990   6445       jjc 		off_lo += 4;
   2991   6445       jjc 		dev++;
   2992   6445       jjc 
   2993   6445       jjc 		/*
   2994   6445       jjc 		 * Both read and write enable bits must be enabled in DRAM
   2995   6445       jjc 		 * address map base register for physical memory to exist in
   2996   6445       jjc 		 * node
   2997   6445       jjc 		 */
   2998   6445       jjc 		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
   2999   6445       jjc 		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
   3000   6445       jjc 			/*
   3001   6445       jjc 			 * Mark node memory as non-existent and set start and
   3002   6445       jjc 			 * end addresses to be same in node_memory[]
   3003   6445       jjc 			 */
   3004   6445       jjc 			node_memory[node].exists = 0;
   3005   6445       jjc 			node_memory[node].start = node_memory[node].end =
   3006   6445       jjc 			    (pfn_t)-1;
   3007   6445       jjc 			continue;
   3008   6445       jjc 		}
   3009   6445       jjc 
   3010   6445       jjc 		/*
   3011   6445       jjc 		 * Mark node memory as existing and remember physical address
   3012   6445       jjc 		 * range of each node for use later
   3013   6445       jjc 		 */
   3014   6445       jjc 		node_memory[node].exists = 1;
   3015   6445       jjc 
   3016   6445       jjc 		node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));
   3017   6445       jjc 
   3018   6445       jjc 		node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
   3019   6445       jjc 		    OPT_DRAMADDR_LO_MASK_OFF);
   3020   6445       jjc 	}
   3021   6445       jjc 
   3022   6445       jjc 	/*
   3023   6445       jjc 	 * Restore PCI Extended Configuration Space enable bit
   3024   6445       jjc 	 */
   3025   6445       jjc 	if (opt_family == AMD_FAMILY_GREYHOUND) {
   3026   6445       jjc 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
   3027   6445       jjc 			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
   3028   6445       jjc 	}
   3029      0    stevel }
   3030      0    stevel 
   3031   6445       jjc 
   3032      0    stevel /*
   3033   6445       jjc  * Return average amount of time to read vendor ID register on Northbridge
   3034   6445       jjc  * N times on specified destination node from current CPU
   3035      0    stevel  */
   3036   6445       jjc static hrtime_t
   3037   6445       jjc opt_probe_vendor(int dest_node, int nreads)
   3038      0    stevel {
   3039   6445       jjc 	int		cnt;
   3040   6445       jjc 	uint_t		dev;
   3041   6445       jjc 	/* LINTED: set but not used in function */
   3042   6445       jjc 	volatile uint_t	dev_vendor;
   3043   6445       jjc 	hrtime_t	elapsed;
   3044   6445       jjc 	hrtime_t	end;
   3045   6445       jjc 	int		ipl;
   3046   6445       jjc 	hrtime_t	start;
   3047   6445       jjc 
   3048   6445       jjc 	dev = OPT_PCS_DEV_NODE0 + dest_node;
   3049   6445       jjc 	kpreempt_disable();
   3050   6445       jjc 	ipl = spl8();
   3051   6445       jjc 	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
   3052   6445       jjc 	    OPT_PCS_OFF_VENDOR));
   3053   6445       jjc 	start = gethrtime();
   3054   6445       jjc 	for (cnt = 0; cnt < nreads; cnt++)
   3055   6445       jjc 		dev_vendor = inl(PCI_CONFDATA);
   3056   6445       jjc 	end = gethrtime();
   3057   6445       jjc 	elapsed = (end - start) / nreads;
   3058   6445       jjc 	splx(ipl);
   3059   6445       jjc 	kpreempt_enable();
   3060   6445       jjc 	return (elapsed);
   3061      0    stevel }
   3062