1 0 stevel /* 2 0 stevel * CDDL HEADER START 3 0 stevel * 4 0 stevel * The contents of this file are subject to the terms of the 5 2220 stevel * Common Development and Distribution License (the "License"). 6 2220 stevel * You may not use this file except in compliance with the License. 7 0 stevel * 8 0 stevel * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 0 stevel * or http://www.opensolaris.org/os/licensing. 10 0 stevel * See the License for the specific language governing permissions 11 0 stevel * and limitations under the License. 12 0 stevel * 13 0 stevel * When distributing Covered Code, include this CDDL HEADER in each 14 0 stevel * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 0 stevel * If applicable, add the following below this CDDL HEADER, with the 16 0 stevel * fields enclosed by brackets "[]" replaced with your own identifying 17 0 stevel * information: Portions Copyright [yyyy] [name of copyright owner] 18 0 stevel * 19 0 stevel * CDDL HEADER END 20 0 stevel */ 21 2220 stevel 22 0 stevel /* 23 9053 jonathan * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 0 stevel * Use is subject to license terms. 25 0 stevel */ 26 0 stevel 27 6445 jjc /* 28 6445 jjc * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS 29 6445 jjc * ================================================================ 30 6445 jjc * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access 31 6445 jjc * (NUMA). A NUMA machine consists of one or more "nodes" that each consist of 32 6445 jjc * one or more CPUs and some local memory. The CPUs in each node can access 33 6445 jjc * the memory in the other nodes but at a higher latency than accessing their 34 6445 jjc * local memory. Typically, a system with only one node has Uniform Memory 35 6445 jjc * Access (UMA), but it may be possible to have a one node system that has 36 6445 jjc * some global memory outside of the node which is higher latency. 
37 6445 jjc * 38 6445 jjc * Module Description 39 6445 jjc * ------------------ 40 6445 jjc * This module provides a platform interface for determining which CPUs and 41 6445 jjc * which memory (and how much) are in a NUMA node and how far each node is from 42 6445 jjc * each other. The interface is used by the Virtual Memory (VM) system and the 43 6445 jjc * common lgroup framework. The VM system uses the plat_*() routines to fill 44 6445 jjc * in its memory node (memnode) array with the physical address range spanned 45 6445 jjc * by each NUMA node to know which memory belongs to which node, so it can 46 6445 jjc * build and manage a physical page free list for each NUMA node and allocate 47 6445 jjc * local memory from each node as needed. The common lgroup framework uses the 48 6445 jjc * exported lgrp_plat_*() routines to figure out which CPUs and memory belong 49 6445 jjc * to each node (leaf lgroup) and how far each node is from each other, so it 50 6445 jjc * can build the latency (lgroup) topology for the machine in order to optimize 51 6445 jjc * for locality. Also, an lgroup platform handle instead of lgroups are used 52 6445 jjc * in the interface with this module, so this module shouldn't need to know 53 6445 jjc * anything about lgroups. Instead, it just needs to know which CPUs, memory, 54 6445 jjc * etc. are in each NUMA node, how far each node is from each other, and to use 55 6445 jjc * a unique lgroup platform handle to refer to each node through the interface. 56 6445 jjc * 57 6445 jjc * Determining NUMA Configuration 58 6445 jjc * ------------------------------ 59 6445 jjc * By default, this module will try to determine the NUMA configuration of the 60 6445 jjc * machine by reading the ACPI System Resource Affinity Table (SRAT) and System 61 6445 jjc * Locality Information Table (SLIT). The SRAT contains info to tell which 62 6445 jjc * CPUs and memory are local to a given proximity domain (NUMA node). 
 * The SLIT is a matrix that gives the distance between each system locality
 * (which is a NUMA node and should correspond to proximity domains in the
 * SRAT).  For more details on the SRAT and SLIT, please refer to an ACPI 3.0
 * or newer specification.
 *
 * If the SRAT doesn't exist on a system with AMD Opteron processors, we
 * examine registers in PCI configuration space to determine how many nodes are
 * in the system and which CPUs and memory are in each node.  This is done
 * while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *	 locality info is not guaranteed to work or be compatible across all
 *	 Opteron processor families.
 *
 * If the SLIT does not exist or does not look right, the kernel will probe to
 * determine the distance between nodes as long as the NUMA CPU and memory
 * configuration has been determined (see lgrp_plat_probe() for details).
80 6445 jjc * 81 6445 jjc * Data Structures 82 6445 jjc * --------------- 83 6445 jjc * The main data structures used by this code are the following: 84 6445 jjc * 85 6671 jjc * - lgrp_plat_cpu_node[] CPU to node ID mapping table indexed by 86 6671 jjc * CPU ID (only used for SRAT) 87 6445 jjc * 88 6445 jjc * - lgrp_plat_lat_stats.latencies[][] Table of latencies between same and 89 6445 jjc * different nodes indexed by node ID 90 6445 jjc * 91 6445 jjc * - lgrp_plat_node_cnt Number of NUMA nodes in system 92 6445 jjc * 93 6445 jjc * - lgrp_plat_node_domain[] Node ID to proximity domain ID mapping 94 6445 jjc * table indexed by node ID (only used 95 6445 jjc * for SRAT) 96 6445 jjc * 97 6445 jjc * - lgrp_plat_node_memory[] Table with physical address range for 98 6445 jjc * each node indexed by node ID 99 6445 jjc * 100 6445 jjc * The code is implemented to make the following always be true: 101 6445 jjc * 102 6445 jjc * lgroup platform handle == node ID == memnode ID 103 6445 jjc * 104 6445 jjc * Moreover, it allows for the proximity domain ID to be equal to all of the 105 6445 jjc * above as long as the proximity domains IDs are numbered from 0 to <number of 106 6445 jjc * nodes - 1>. This is done by hashing each proximity domain ID into the range 107 6445 jjc * from 0 to <number of nodes - 1>. Then proximity ID N will hash into node ID 108 6445 jjc * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N] 109 6445 jjc * and be assigned node ID N. If the proximity domain IDs aren't numbered 110 6445 jjc * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into 111 6445 jjc * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs 112 6445 jjc * to node IDs. However, the proximity domain IDs may not map to the 113 6445 jjc * equivalent node ID since we want to keep the node IDs numbered from 0 to 114 6445 jjc * <number of nodes - 1> to minimize cost of searching and potentially space. 
 *
 * The code below really tries to do the above.  However, the virtual memory
 * system expects the memnodes which describe the physical address range for
 * each NUMA node to be arranged in ascending order by physical address.  (:-(
 * Otherwise, the kernel will panic in different semi-random places in the VM
 * system.
 *
 * Consequently, this module has to try to sort the nodes in ascending order by
 * each node's starting physical address to try to meet this "constraint" in
 * the VM system (see lgrp_plat_node_sort()).  Also, the lowest numbered
 * proximity domain ID in the system is determined and used to make the lowest
 * numbered proximity domain map to node 0 in hopes that the proximity domains
 * are sorted in ascending order by physical address already even if their IDs
 * don't start at 0 (see NODE_DOMAIN_HASH() and lgrp_plat_srat_domains()).
 * Finally, it is important to note that these workarounds may not be
 * sufficient if/when memory hotplugging is supported and the VM system may
 * ultimately need to be fixed to handle this....
132 6445 jjc */ 133 6445 jjc 134 6445 jjc 135 0 stevel #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */ 136 6671 jjc #include <sys/bootconf.h> 137 0 stevel #include <sys/cmn_err.h> 138 4898 jjc #include <sys/controlregs.h> 139 0 stevel #include <sys/cpupart.h> 140 0 stevel #include <sys/cpuvar.h> 141 0 stevel #include <sys/lgrp.h> 142 0 stevel #include <sys/machsystm.h> 143 0 stevel #include <sys/memlist.h> 144 0 stevel #include <sys/memnode.h> 145 0 stevel #include <sys/mman.h> 146 938 esaxe #include <sys/pci_cfgspace.h> 147 938 esaxe #include <sys/pci_impl.h> 148 0 stevel #include <sys/param.h> 149 3434 esaxe #include <sys/pghw.h> 150 0 stevel #include <sys/promif.h> /* for prom_printf() */ 151 6445 jjc #include <sys/sysmacros.h> 152 0 stevel #include <sys/systm.h> 153 0 stevel #include <sys/thread.h> 154 0 stevel #include <sys/types.h> 155 0 stevel #include <sys/var.h> 156 0 stevel #include <sys/x86_archext.h> /* for x86_feature and X86_AMD */ 157 0 stevel #include <vm/hat_i86.h> 158 0 stevel #include <vm/seg_kmem.h> 159 414 kchow #include <vm/vm_dep.h> 160 0 stevel 161 6445 jjc #include "acpi_fw.h" /* for SRAT and SLIT */ 162 0 stevel 163 0 stevel 164 0 stevel #define MAX_NODES 8 165 0 stevel #define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1) 166 0 stevel 167 6445 jjc /* 168 6445 jjc * Constants for configuring probing 169 6445 jjc */ 170 0 stevel #define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for probing */ 171 0 stevel #define LGRP_PLAT_PROBE_NSAMPLES 1 /* default samples to take */ 172 1228 andrei #define LGRP_PLAT_PROBE_NREADS 256 /* number of vendor ID reads */ 173 0 stevel 174 0 stevel /* 175 6445 jjc * Flags for probing 176 0 stevel */ 177 6445 jjc #define LGRP_PLAT_PROBE_ENABLE 0x1 /* enable probing */ 178 6445 jjc #define LGRP_PLAT_PROBE_PGCPY 0x2 /* probe using page copy */ 179 6445 jjc #define LGRP_PLAT_PROBE_VENDOR 0x4 /* probe vendor ID register */ 180 0 stevel 181 0 stevel /* 182 9716 jonathan * Hash proximity domain ID into node to domain 
mapping table "mod" number of 183 9716 jonathan * nodes to minimize span of entries used and try to have lowest numbered 184 9716 jonathan * proximity domain be node 0 185 0 stevel */ 186 9716 jonathan #define NODE_DOMAIN_HASH(domain, node_cnt) \ 187 9716 jonathan ((lgrp_plat_prox_domain_min == UINT32_MAX) ? (domain) % node_cnt : \ 188 9716 jonathan ((domain) - lgrp_plat_prox_domain_min) % node_cnt) 189 0 stevel 190 4898 jjc 191 4898 jjc /* 192 6671 jjc * CPU to node ID mapping structure (only used with SRAT) 193 4898 jjc */ 194 6445 jjc typedef struct cpu_node_map { 195 6445 jjc int exists; 196 6445 jjc uint_t node; 197 6445 jjc uint32_t apicid; 198 6445 jjc uint32_t prox_domain; 199 6445 jjc } cpu_node_map_t; 200 0 stevel 201 0 stevel /* 202 6445 jjc * Latency statistics 203 0 stevel */ 204 6445 jjc typedef struct lgrp_plat_latency_stats { 205 6445 jjc hrtime_t latencies[MAX_NODES][MAX_NODES]; 206 6445 jjc hrtime_t latency_max; 207 6445 jjc hrtime_t latency_min; 208 6445 jjc } lgrp_plat_latency_stats_t; 209 0 stevel 210 0 stevel /* 211 6445 jjc * Memory configuration for probing 212 0 stevel */ 213 6445 jjc typedef struct lgrp_plat_probe_mem_config { 214 6445 jjc size_t probe_memsize; /* how much memory to probe per node */ 215 6445 jjc caddr_t probe_va[MAX_NODES]; /* where memory mapped for probing */ 216 6445 jjc pfn_t probe_pfn[MAX_NODES]; /* physical pages to map for probing */ 217 6445 jjc } lgrp_plat_probe_mem_config_t; 218 0 stevel 219 0 stevel /* 220 6445 jjc * Statistics kept for probing 221 0 stevel */ 222 6445 jjc typedef struct lgrp_plat_probe_stats { 223 6445 jjc hrtime_t flush_cost; 224 6445 jjc hrtime_t probe_cost; 225 6445 jjc hrtime_t probe_cost_total; 226 6445 jjc hrtime_t probe_error_code; 227 6445 jjc hrtime_t probe_errors[MAX_NODES][MAX_NODES]; 228 6445 jjc int probe_suspect[MAX_NODES][MAX_NODES]; 229 6445 jjc hrtime_t probe_max[MAX_NODES][MAX_NODES]; 230 6445 jjc hrtime_t probe_min[MAX_NODES][MAX_NODES]; 231 6445 jjc } 
lgrp_plat_probe_stats_t; 232 0 stevel 233 0 stevel /* 234 6445 jjc * Node to proximity domain ID mapping structure (only used with SRAT) 235 0 stevel */ 236 6445 jjc typedef struct node_domain_map { 237 6445 jjc int exists; 238 6445 jjc uint32_t prox_domain; 239 6445 jjc } node_domain_map_t; 240 0 stevel 241 0 stevel /* 242 6445 jjc * Node ID and starting and ending page for physical memory in node 243 0 stevel */ 244 6445 jjc typedef struct node_phys_addr_map { 245 6445 jjc pfn_t start; 246 6445 jjc pfn_t end; 247 6445 jjc int exists; 248 6445 jjc uint32_t prox_domain; 249 6445 jjc } node_phys_addr_map_t; 250 6445 jjc 251 6671 jjc /* 252 6706 jjc * Number of CPUs for which we got APIC IDs 253 6671 jjc */ 254 6706 jjc static int lgrp_plat_apic_ncpus = 0; 255 0 stevel 256 0 stevel /* 257 10710 jonathan * CPU to node ID mapping table (only used for SRAT) and its max number of 258 10710 jonathan * entries 259 0 stevel */ 260 10710 jonathan static cpu_node_map_t *lgrp_plat_cpu_node = NULL; 261 10710 jonathan static uint_t lgrp_plat_cpu_node_nentries = 0; 262 0 stevel 263 4898 jjc /* 264 6445 jjc * Latency statistics 265 4898 jjc */ 266 6445 jjc lgrp_plat_latency_stats_t lgrp_plat_lat_stats; 267 0 stevel 268 0 stevel /* 269 0 stevel * Whether memory is interleaved across nodes causing MPO to be disabled 270 0 stevel */ 271 6445 jjc static int lgrp_plat_mem_intrlv = 0; 272 6445 jjc 273 6445 jjc /* 274 6445 jjc * Node ID to proximity domain ID mapping table (only used for SRAT) 275 6445 jjc */ 276 6445 jjc static node_domain_map_t lgrp_plat_node_domain[MAX_NODES]; 277 6445 jjc 278 6445 jjc /* 279 6445 jjc * Physical address range for memory in each node 280 6445 jjc */ 281 6445 jjc static node_phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; 282 6445 jjc 283 6445 jjc /* 284 6445 jjc * Statistics gotten from probing 285 6445 jjc */ 286 6445 jjc static lgrp_plat_probe_stats_t lgrp_plat_probe_stats; 287 6445 jjc 288 6445 jjc /* 289 6445 jjc * Memory configuration for 
probing 290 6445 jjc */ 291 6445 jjc static lgrp_plat_probe_mem_config_t lgrp_plat_probe_mem_config; 292 6445 jjc 293 6445 jjc /* 294 9716 jonathan * Lowest proximity domain ID seen in ACPI SRAT 295 9716 jonathan */ 296 9716 jonathan static uint32_t lgrp_plat_prox_domain_min = UINT32_MAX; 297 9716 jonathan 298 9716 jonathan /* 299 6445 jjc * Error code from processing ACPI SRAT 300 6445 jjc */ 301 6445 jjc static int lgrp_plat_srat_error = 0; 302 6445 jjc 303 6445 jjc /* 304 6445 jjc * Error code from processing ACPI SLIT 305 6445 jjc */ 306 6445 jjc static int lgrp_plat_slit_error = 0; 307 6445 jjc 308 6445 jjc /* 309 6445 jjc * Allocate lgroup array statically 310 6445 jjc */ 311 6445 jjc static lgrp_t lgrp_space[NLGRP]; 312 6445 jjc static int nlgrps_alloc; 313 6445 jjc 314 0 stevel 315 0 stevel /* 316 9716 jonathan * Enable finding and using minimum proximity domain ID when hashing 317 9716 jonathan */ 318 9716 jonathan int lgrp_plat_domain_min_enable = 1; 319 9716 jonathan 320 9716 jonathan /* 321 0 stevel * Number of nodes in system 322 0 stevel */ 323 0 stevel uint_t lgrp_plat_node_cnt = 1; 324 9716 jonathan 325 9716 jonathan /* 326 9716 jonathan * Enable sorting nodes in ascending order by starting physical address 327 9716 jonathan */ 328 9716 jonathan int lgrp_plat_node_sort_enable = 1; 329 0 stevel 330 0 stevel /* 331 6445 jjc * Configuration Parameters for Probing 332 6445 jjc * - lgrp_plat_probe_flags Flags to specify enabling probing, probe 333 6445 jjc * operation, etc. 
334 6445 jjc * - lgrp_plat_probe_nrounds How many rounds of probing to do 335 6445 jjc * - lgrp_plat_probe_nsamples Number of samples to take when probing each 336 6445 jjc * node 337 6445 jjc * - lgrp_plat_probe_nreads Number of times to read vendor ID from 338 6445 jjc * Northbridge for each probe 339 0 stevel */ 340 6445 jjc uint_t lgrp_plat_probe_flags = 0; 341 0 stevel int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 342 0 stevel int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 343 1228 andrei int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 344 0 stevel 345 0 stevel /* 346 6445 jjc * Enable use of ACPI System Resource Affinity Table (SRAT) and System 347 6445 jjc * Locality Information Table (SLIT) 348 0 stevel */ 349 6445 jjc int lgrp_plat_srat_enable = 1; 350 6445 jjc int lgrp_plat_slit_enable = 1; 351 0 stevel 352 0 stevel /* 353 9732 Kit * mnode_xwa: set to non-zero value to initiate workaround if large pages are 354 9732 Kit * found to be crossing memory node boundaries. The workaround will eliminate 355 9732 Kit * a base size page at the end of each memory node boundary to ensure that 356 9732 Kit * a large page with constituent pages that span more than 1 memory node 357 9732 Kit * can never be formed. 
358 9732 Kit * 359 9732 Kit */ 360 9732 Kit int mnode_xwa = 1; 361 9732 Kit 362 9732 Kit /* 363 6445 jjc * Static array to hold lgroup statistics 364 0 stevel */ 365 6445 jjc struct lgrp_stats lgrp_stats[NLGRP]; 366 6445 jjc 367 0 stevel 368 0 stevel /* 369 6445 jjc * Forward declarations of platform interface routines 370 0 stevel */ 371 6445 jjc void plat_build_mem_nodes(struct memlist *list); 372 6445 jjc 373 6445 jjc int plat_lgrphand_to_mem_node(lgrp_handle_t hand); 374 6445 jjc 375 6445 jjc lgrp_handle_t plat_mem_node_to_lgrphand(int mnode); 376 6445 jjc 377 6445 jjc int plat_mnode_xcheck(pfn_t pfncnt); 378 6445 jjc 379 6445 jjc int plat_pfn_to_mem_node(pfn_t pfn); 380 0 stevel 381 0 stevel /* 382 6445 jjc * Forward declarations of lgroup platform interface routines 383 0 stevel */ 384 6445 jjc lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid); 385 6445 jjc 386 6445 jjc void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg); 387 6445 jjc 388 6445 jjc lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id); 389 6445 jjc 390 10710 jonathan void lgrp_plat_init(lgrp_init_stages_t stage); 391 6445 jjc 392 6445 jjc int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to); 393 6445 jjc 394 6445 jjc int lgrp_plat_max_lgrps(void); 395 6445 jjc 396 6445 jjc pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand, 397 6445 jjc lgrp_mem_query_t query); 398 6445 jjc 399 6445 jjc lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn); 400 6445 jjc 401 6445 jjc void lgrp_plat_probe(void); 402 6445 jjc 403 6445 jjc lgrp_handle_t lgrp_plat_root_hand(void); 404 6445 jjc 405 0 stevel 406 0 stevel /* 407 6445 jjc * Forward declarations of local routines 408 0 stevel */ 409 6445 jjc static int is_opteron(void); 410 6445 jjc 411 6671 jjc static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, 412 6706 jjc int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, 413 6706 jjc uint32_t domain); 414 6671 jjc 415 10710 jonathan static int lgrp_plat_cpu_to_node(cpu_t *cp, 
cpu_node_map_t *cpu_node, 416 10710 jonathan int cpu_node_nentries); 417 6445 jjc 418 6445 jjc static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, 419 6706 jjc int node_cnt, uint32_t domain); 420 10710 jonathan 421 10710 jonathan static void lgrp_plat_get_numa_config(void); 422 6445 jjc 423 6445 jjc static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 424 6445 jjc lgrp_plat_latency_stats_t *lat_stats, 425 6445 jjc lgrp_plat_probe_stats_t *probe_stats); 426 6445 jjc 427 6445 jjc static int lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 428 6445 jjc lgrp_plat_latency_stats_t *lat_stats); 429 10710 jonathan 430 10710 jonathan static void lgrp_plat_main_init(void); 431 6445 jjc 432 6445 jjc static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); 433 6445 jjc 434 6445 jjc static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, 435 6706 jjc int node_cnt, uint32_t domain); 436 6445 jjc 437 6445 jjc static int lgrp_plat_node_memory_update(node_domain_map_t *node_domain, 438 6706 jjc int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start, 439 6706 jjc uint64_t end, uint32_t domain); 440 6445 jjc 441 9716 jonathan static void lgrp_plat_node_sort(node_domain_map_t *node_domain, 442 9716 jonathan int node_cnt, cpu_node_map_t *cpu_node, int cpu_count, 443 9716 jonathan node_phys_addr_map_t *node_memory); 444 9716 jonathan 445 6445 jjc static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 446 10710 jonathan int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config, 447 10710 jonathan lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats); 448 6445 jjc 449 6706 jjc static int lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node); 450 6671 jjc 451 6445 jjc static int lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 452 6445 jjc node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats); 453 6445 jjc 454 6706 jjc 
static int lgrp_plat_process_srat(struct srat *tp, 455 9716 jonathan uint32_t *prox_domain_min, node_domain_map_t *node_domain, 456 9716 jonathan cpu_node_map_t *cpu_node, int cpu_count, 457 6445 jjc node_phys_addr_map_t *node_memory); 458 10710 jonathan 459 10710 jonathan static void lgrp_plat_release_bootstrap(void); 460 6445 jjc 461 9716 jonathan static int lgrp_plat_srat_domains(struct srat *tp, 462 9716 jonathan uint32_t *prox_domain_min); 463 6445 jjc 464 6445 jjc static void lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, 465 6445 jjc lgrp_plat_latency_stats_t *lat_stats); 466 6445 jjc 467 6445 jjc static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 468 6445 jjc node_phys_addr_map_t *node_memory); 469 6445 jjc 470 6445 jjc static hrtime_t opt_probe_vendor(int dest_node, int nreads); 471 0 stevel 472 0 stevel 473 0 stevel /* 474 6445 jjc * PLATFORM INTERFACE ROUTINES 475 0 stevel */ 476 0 stevel 477 0 stevel /* 478 0 stevel * Configure memory nodes for machines with more than one node (ie NUMA) 479 0 stevel */ 480 0 stevel void 481 0 stevel plat_build_mem_nodes(struct memlist *list) 482 0 stevel { 483 892 jjc pfn_t cur_start; /* start addr of subrange */ 484 892 jjc pfn_t cur_end; /* end addr of subrange */ 485 892 jjc pfn_t start; /* start addr of whole range */ 486 892 jjc pfn_t end; /* end addr of whole range */ 487 9732 Kit pgcnt_t endcnt; /* pages to sacrifice */ 488 0 stevel 489 0 stevel /* 490 0 stevel * Boot install lists are arranged <addr, len>, ... 
491 0 stevel */ 492 0 stevel while (list) { 493 0 stevel int node; 494 0 stevel 495 0 stevel start = list->address >> PAGESHIFT; 496 0 stevel end = (list->address + list->size - 1) >> PAGESHIFT; 497 0 stevel 498 0 stevel if (start > physmax) { 499 0 stevel list = list->next; 500 0 stevel continue; 501 0 stevel } 502 0 stevel if (end > physmax) 503 0 stevel end = physmax; 504 0 stevel 505 0 stevel /* 506 0 stevel * When there is only one memnode, just add memory to memnode 507 0 stevel */ 508 0 stevel if (max_mem_nodes == 1) { 509 0 stevel mem_node_add_slice(start, end); 510 0 stevel list = list->next; 511 0 stevel continue; 512 0 stevel } 513 0 stevel 514 0 stevel /* 515 0 stevel * mem_node_add_slice() expects to get a memory range that 516 0 stevel * is within one memnode, so need to split any memory range 517 0 stevel * that spans multiple memnodes into subranges that are each 518 0 stevel * contained within one memnode when feeding them to 519 0 stevel * mem_node_add_slice() 520 0 stevel */ 521 0 stevel cur_start = start; 522 0 stevel do { 523 0 stevel node = plat_pfn_to_mem_node(cur_start); 524 0 stevel 525 892 jjc /* 526 892 jjc * Panic if DRAM address map registers or SRAT say 527 892 jjc * memory in node doesn't exist or address from 528 892 jjc * boot installed memory list entry isn't in this node. 529 892 jjc * This shouldn't happen and rest of code can't deal 530 892 jjc * with this if it does. 
531 892 jjc */ 532 892 jjc if (node < 0 || node >= lgrp_plat_node_cnt || 533 892 jjc !lgrp_plat_node_memory[node].exists || 534 892 jjc cur_start < lgrp_plat_node_memory[node].start || 535 892 jjc cur_start > lgrp_plat_node_memory[node].end) { 536 892 jjc cmn_err(CE_PANIC, "Don't know which memnode " 537 892 jjc "to add installed memory address 0x%lx\n", 538 892 jjc cur_start); 539 892 jjc } 540 0 stevel 541 0 stevel /* 542 0 stevel * End of current subrange should not span memnodes 543 0 stevel */ 544 892 jjc cur_end = end; 545 9732 Kit endcnt = 0; 546 892 jjc if (lgrp_plat_node_memory[node].exists && 547 9732 Kit cur_end > lgrp_plat_node_memory[node].end) { 548 0 stevel cur_end = lgrp_plat_node_memory[node].end; 549 9732 Kit if (mnode_xwa > 1) { 550 9732 Kit /* 551 9732 Kit * sacrifice the last page in each 552 9732 Kit * node to eliminate large pages 553 9732 Kit * that span more than 1 memory node. 554 9732 Kit */ 555 9732 Kit endcnt = 1; 556 10274 Kit physinstalled--; 557 9732 Kit } 558 9732 Kit } 559 0 stevel 560 9732 Kit mem_node_add_slice(cur_start, cur_end - endcnt); 561 0 stevel 562 0 stevel /* 563 0 stevel * Next subrange starts after end of current one 564 0 stevel */ 565 0 stevel cur_start = cur_end + 1; 566 0 stevel } while (cur_end < end); 567 0 stevel 568 0 stevel list = list->next; 569 0 stevel } 570 0 stevel mem_node_physalign = 0; 571 0 stevel mem_node_pfn_shift = 0; 572 0 stevel } 573 0 stevel 574 0 stevel 575 6445 jjc int 576 6445 jjc plat_lgrphand_to_mem_node(lgrp_handle_t hand) 577 6445 jjc { 578 6445 jjc if (max_mem_nodes == 1) 579 6445 jjc return (0); 580 6445 jjc 581 6445 jjc return ((int)hand); 582 6445 jjc } 583 6445 jjc 584 6445 jjc 585 6445 jjc /* 586 6445 jjc * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt 587 6445 jjc * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if 588 6445 jjc * a crossing is found and returns 0 otherwise. 
589 6445 jjc */ 590 6445 jjc int 591 6445 jjc plat_mnode_xcheck(pfn_t pfncnt) 592 6445 jjc { 593 6445 jjc int node, prevnode = -1, basenode; 594 6445 jjc pfn_t ea, sa; 595 6445 jjc 596 6445 jjc for (node = 0; node < lgrp_plat_node_cnt; node++) { 597 6445 jjc 598 6445 jjc if (lgrp_plat_node_memory[node].exists == 0) 599 6445 jjc continue; 600 6445 jjc 601 6445 jjc if (prevnode == -1) { 602 6445 jjc prevnode = node; 603 6445 jjc basenode = node; 604 6445 jjc continue; 605 6445 jjc } 606 6445 jjc 607 6445 jjc /* assume x86 node pfn ranges are in increasing order */ 608 6445 jjc ASSERT(lgrp_plat_node_memory[node].start > 609 6445 jjc lgrp_plat_node_memory[prevnode].end); 610 6445 jjc 611 6445 jjc /* 612 6445 jjc * continue if the starting address of node is not contiguous 613 6445 jjc * with the previous node. 614 6445 jjc */ 615 6445 jjc 616 6445 jjc if (lgrp_plat_node_memory[node].start != 617 6445 jjc (lgrp_plat_node_memory[prevnode].end + 1)) { 618 6445 jjc basenode = node; 619 6445 jjc prevnode = node; 620 6445 jjc continue; 621 6445 jjc } 622 6445 jjc 623 6445 jjc /* check if the starting address of node is pfncnt aligned */ 624 6445 jjc if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) { 625 6445 jjc 626 6445 jjc /* 627 6445 jjc * at this point, node starts at an unaligned boundary 628 6445 jjc * and is contiguous with the previous node(s) to 629 6445 jjc * basenode. Check if there is an aligned contiguous 630 6445 jjc * range of length pfncnt that crosses this boundary. 631 6445 jjc */ 632 6445 jjc 633 6445 jjc sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end, 634 6445 jjc pfncnt); 635 6445 jjc ea = P2ROUNDUP((lgrp_plat_node_memory[node].start), 636 6445 jjc pfncnt); 637 6445 jjc 638 6445 jjc ASSERT((ea - sa) == pfncnt); 639 6445 jjc if (sa >= lgrp_plat_node_memory[basenode].start && 640 9732 Kit ea <= (lgrp_plat_node_memory[node].end + 1)) { 641 9732 Kit /* 642 9732 Kit * large page found to cross mnode boundary. 
643 9732 Kit * Return Failure if workaround not enabled. 644 9732 Kit */ 645 9732 Kit if (mnode_xwa == 0) 646 9732 Kit return (1); 647 9732 Kit mnode_xwa++; 648 9732 Kit } 649 6445 jjc } 650 6445 jjc prevnode = node; 651 6445 jjc } 652 6445 jjc return (0); 653 6445 jjc } 654 6445 jjc 655 6445 jjc 656 6445 jjc lgrp_handle_t 657 6445 jjc plat_mem_node_to_lgrphand(int mnode) 658 6445 jjc { 659 6445 jjc if (max_mem_nodes == 1) 660 6445 jjc return (LGRP_DEFAULT_HANDLE); 661 6445 jjc 662 6445 jjc return ((lgrp_handle_t)mnode); 663 6445 jjc } 664 6445 jjc 665 6445 jjc 666 6445 jjc int 667 6445 jjc plat_pfn_to_mem_node(pfn_t pfn) 668 6445 jjc { 669 6445 jjc int node; 670 6445 jjc 671 6445 jjc if (max_mem_nodes == 1) 672 6445 jjc return (0); 673 6445 jjc 674 6445 jjc for (node = 0; node < lgrp_plat_node_cnt; node++) { 675 6445 jjc /* 676 6445 jjc * Skip nodes with no memory 677 6445 jjc */ 678 6445 jjc if (!lgrp_plat_node_memory[node].exists) 679 6445 jjc continue; 680 6445 jjc 681 6445 jjc if (pfn >= lgrp_plat_node_memory[node].start && 682 6445 jjc pfn <= lgrp_plat_node_memory[node].end) 683 6445 jjc return (node); 684 6445 jjc } 685 6445 jjc 686 6445 jjc /* 687 6445 jjc * Didn't find memnode where this PFN lives which should never happen 688 6445 jjc */ 689 6445 jjc ASSERT(node < lgrp_plat_node_cnt); 690 6445 jjc return (-1); 691 6445 jjc } 692 6445 jjc 693 6445 jjc 694 6445 jjc /* 695 6445 jjc * LGROUP PLATFORM INTERFACE ROUTINES 696 6445 jjc */ 697 6445 jjc 698 6445 jjc /* 699 6445 jjc * Allocate additional space for an lgroup. 
700 6445 jjc */ 701 6445 jjc /* ARGSUSED */ 702 6445 jjc lgrp_t * 703 6445 jjc lgrp_plat_alloc(lgrp_id_t lgrpid) 704 6445 jjc { 705 6445 jjc lgrp_t *lgrp; 706 6445 jjc 707 6445 jjc lgrp = &lgrp_space[nlgrps_alloc++]; 708 6445 jjc if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 709 6445 jjc return (NULL); 710 6445 jjc return (lgrp); 711 6445 jjc } 712 6445 jjc 713 6445 jjc 714 6445 jjc /* 715 6445 jjc * Platform handling for (re)configuration changes 716 6445 jjc */ 717 6445 jjc /* ARGSUSED */ 718 6445 jjc void 719 6445 jjc lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 720 6445 jjc { 721 6445 jjc } 722 6445 jjc 723 6445 jjc 724 6445 jjc /* 725 6445 jjc * Return the platform handle for the lgroup containing the given CPU 726 6445 jjc */ 727 6445 jjc /* ARGSUSED */ 728 6445 jjc lgrp_handle_t 729 6445 jjc lgrp_plat_cpu_to_hand(processorid_t id) 730 6445 jjc { 731 6445 jjc lgrp_handle_t hand; 732 6445 jjc 733 6445 jjc if (lgrp_plat_node_cnt == 1) 734 6445 jjc return (LGRP_DEFAULT_HANDLE); 735 6445 jjc 736 6445 jjc hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], 737 10710 jonathan lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries); 738 6445 jjc 739 6445 jjc ASSERT(hand != (lgrp_handle_t)-1); 740 6445 jjc if (hand == (lgrp_handle_t)-1) 741 6445 jjc return (LGRP_NULL_HANDLE); 742 6445 jjc 743 6445 jjc return (hand); 744 6445 jjc } 745 6445 jjc 746 6445 jjc 747 0 stevel /* 748 0 stevel * Platform-specific initialization of lgroups 749 0 stevel */ 750 0 stevel void 751 10710 jonathan lgrp_plat_init(lgrp_init_stages_t stage) 752 0 stevel { 753 5084 johnlev #if defined(__xpv) 754 5084 johnlev #else /* __xpv */ 755 9053 jonathan u_longlong_t value; 756 10710 jonathan #endif /* __xpv */ 757 10710 jonathan 758 10710 jonathan switch (stage) { 759 10710 jonathan case LGRP_INIT_STAGE1: 760 10710 jonathan #if defined(__xpv) 761 10710 jonathan /* 762 10710 jonathan * XXPV For now, the hypervisor treats all memory equally. 
763 10710 jonathan */ 764 10710 jonathan lgrp_plat_node_cnt = max_mem_nodes = 1; 765 10710 jonathan #else /* __xpv */ 766 10710 jonathan /* 767 10710 jonathan * Get boot property for lgroup topology height limit 768 10710 jonathan */ 769 10710 jonathan if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0) 770 10710 jonathan (void) lgrp_topo_ht_limit_set((int)value); 771 10710 jonathan 772 10710 jonathan /* 773 10710 jonathan * Get boot property for enabling/disabling SRAT 774 10710 jonathan */ 775 10710 jonathan if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0) 776 10710 jonathan lgrp_plat_srat_enable = (int)value; 777 10710 jonathan 778 10710 jonathan /* 779 10710 jonathan * Get boot property for enabling/disabling SLIT 780 10710 jonathan */ 781 10710 jonathan if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0) 782 10710 jonathan lgrp_plat_slit_enable = (int)value; 783 10710 jonathan 784 10710 jonathan /* 785 10710 jonathan * Initialize as a UMA machine 786 10710 jonathan */ 787 10710 jonathan if (lgrp_topo_ht_limit() == 1) { 788 10710 jonathan lgrp_plat_node_cnt = max_mem_nodes = 1; 789 10710 jonathan return; 790 10710 jonathan } 791 10710 jonathan 792 10710 jonathan lgrp_plat_get_numa_config(); 793 10710 jonathan #endif /* __xpv */ 794 10710 jonathan break; 795 10710 jonathan 796 10710 jonathan case LGRP_INIT_STAGE3: 797 10710 jonathan lgrp_plat_probe(); 798 10710 jonathan lgrp_plat_release_bootstrap(); 799 10710 jonathan break; 800 10710 jonathan 801 10710 jonathan case LGRP_INIT_STAGE4: 802 10710 jonathan lgrp_plat_main_init(); 803 10710 jonathan break; 804 10710 jonathan 805 10710 jonathan default: 806 10710 jonathan break; 807 10710 jonathan } 808 10710 jonathan } 809 10710 jonathan 810 10710 jonathan 811 10710 jonathan /* 812 10710 jonathan * Return latency between "from" and "to" lgroups 813 10710 jonathan * 814 10710 jonathan * This latency number can only be used for relative comparison 815 10710 jonathan * between lgroups on the running system, 
cannot be used across platforms, 816 10710 jonathan * and may not reflect the actual latency. It is platform and implementation 817 10710 jonathan * specific, so platform gets to decide its value. It would be nice if the 818 10710 jonathan * number was at least proportional to make comparisons more meaningful though. 819 10710 jonathan */ 820 10710 jonathan /* ARGSUSED */ 821 10710 jonathan int 822 10710 jonathan lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 823 10710 jonathan { 824 10710 jonathan lgrp_handle_t src, dest; 825 10710 jonathan int node; 826 10710 jonathan 827 10710 jonathan if (max_mem_nodes == 1) 828 10710 jonathan return (0); 829 9053 jonathan 830 9053 jonathan /* 831 10710 jonathan * Return max latency for root lgroup 832 9053 jonathan */ 833 10710 jonathan if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 834 10710 jonathan return (lgrp_plat_lat_stats.latency_max); 835 10710 jonathan 836 10710 jonathan src = from; 837 10710 jonathan dest = to; 838 9053 jonathan 839 9053 jonathan /* 840 10710 jonathan * Return 0 for nodes (lgroup platform handles) out of range 841 9053 jonathan */ 842 10710 jonathan if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 843 10710 jonathan return (0); 844 9053 jonathan 845 9053 jonathan /* 846 10710 jonathan * Probe from current CPU if its lgroup latencies haven't been set yet 847 10710 jonathan * and we are trying to get latency from current CPU to some node 848 9053 jonathan */ 849 10710 jonathan node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 850 10710 jonathan lgrp_plat_cpu_node_nentries); 851 10710 jonathan ASSERT(node >= 0 && node < lgrp_plat_node_cnt); 852 10710 jonathan if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src) 853 10710 jonathan lgrp_plat_probe(); 854 10710 jonathan 855 10710 jonathan return (lgrp_plat_lat_stats.latencies[src][dest]); 856 10710 jonathan } 857 10710 jonathan 858 10710 jonathan 859 10710 jonathan /* 860 10710 jonathan * Return the 
maximum number of lgrps supported by the platform. 861 10710 jonathan * Before lgrp topology is known it returns an estimate based on the number of 862 10710 jonathan * nodes. Once topology is known it returns the actual maximim number of lgrps 863 10710 jonathan * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and 864 10710 jonathan * dynamic addition of new nodes, this number may not grow during system 865 10710 jonathan * lifetime (yet). 866 10710 jonathan */ 867 10710 jonathan int 868 10710 jonathan lgrp_plat_max_lgrps(void) 869 10710 jonathan { 870 10710 jonathan return (lgrp_topo_initialized ? 871 10710 jonathan lgrp_alloc_max + 1 : 872 10710 jonathan lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); 873 10710 jonathan } 874 10710 jonathan 875 10710 jonathan 876 10710 jonathan /* 877 10710 jonathan * Return the number of free pages in an lgroup. 878 10710 jonathan * 879 10710 jonathan * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize 880 10710 jonathan * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the 881 10710 jonathan * number of allocatable base pagesize pages corresponding to the 882 10710 jonathan * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..) 883 10710 jonathan * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical 884 10710 jonathan * memory installed, regardless of whether or not it's usable. 
885 10710 jonathan */ 886 10710 jonathan pgcnt_t 887 10710 jonathan lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 888 10710 jonathan { 889 10710 jonathan int mnode; 890 10710 jonathan pgcnt_t npgs = (pgcnt_t)0; 891 10710 jonathan extern struct memlist *phys_avail; 892 10710 jonathan extern struct memlist *phys_install; 893 10710 jonathan 894 10710 jonathan 895 10710 jonathan if (plathand == LGRP_DEFAULT_HANDLE) 896 10710 jonathan return (lgrp_plat_mem_size_default(plathand, query)); 897 10710 jonathan 898 10710 jonathan if (plathand != LGRP_NULL_HANDLE) { 899 10710 jonathan mnode = plat_lgrphand_to_mem_node(plathand); 900 10710 jonathan if (mnode >= 0 && mem_node_config[mnode].exists) { 901 10710 jonathan switch (query) { 902 10710 jonathan case LGRP_MEM_SIZE_FREE: 903 10710 jonathan npgs = MNODE_PGCNT(mnode); 904 10710 jonathan break; 905 10710 jonathan case LGRP_MEM_SIZE_AVAIL: 906 10710 jonathan npgs = mem_node_memlist_pages(mnode, 907 10710 jonathan phys_avail); 908 10710 jonathan break; 909 10710 jonathan case LGRP_MEM_SIZE_INSTALL: 910 10710 jonathan npgs = mem_node_memlist_pages(mnode, 911 10710 jonathan phys_install); 912 10710 jonathan break; 913 10710 jonathan default: 914 10710 jonathan break; 915 10710 jonathan } 916 10710 jonathan } 917 10710 jonathan } 918 10710 jonathan return (npgs); 919 10710 jonathan } 920 10710 jonathan 921 10710 jonathan 922 10710 jonathan /* 923 10710 jonathan * Return the platform handle of the lgroup that contains the physical memory 924 10710 jonathan * corresponding to the given page frame number 925 10710 jonathan */ 926 10710 jonathan /* ARGSUSED */ 927 10710 jonathan lgrp_handle_t 928 10710 jonathan lgrp_plat_pfn_to_hand(pfn_t pfn) 929 10710 jonathan { 930 10710 jonathan int mnode; 931 10710 jonathan 932 10710 jonathan if (max_mem_nodes == 1) 933 10710 jonathan return (LGRP_DEFAULT_HANDLE); 934 10710 jonathan 935 10710 jonathan if (pfn > physmax) 936 10710 jonathan return (LGRP_NULL_HANDLE); 937 
10710 jonathan 938 10710 jonathan mnode = plat_pfn_to_mem_node(pfn); 939 10710 jonathan if (mnode < 0) 940 10710 jonathan return (LGRP_NULL_HANDLE); 941 10710 jonathan 942 10710 jonathan return (MEM_NODE_2_LGRPHAND(mnode)); 943 10710 jonathan } 944 10710 jonathan 945 10710 jonathan 946 10710 jonathan /* 947 10710 jonathan * Probe memory in each node from current CPU to determine latency topology 948 10710 jonathan * 949 10710 jonathan * The probing code will probe the vendor ID register on the Northbridge of 950 10710 jonathan * Opteron processors and probe memory for other processors by default. 951 10710 jonathan * 952 10710 jonathan * Since probing is inherently error prone, the code takes laps across all the 953 10710 jonathan * nodes probing from each node to each of the other nodes some number of 954 10710 jonathan * times. Furthermore, each node is probed some number of times before moving 955 10710 jonathan * onto the next one during each lap. The minimum latency gotten between nodes 956 10710 jonathan * is kept as the latency between the nodes. 957 10710 jonathan * 958 10710 jonathan * After all that, the probe times are adjusted by normalizing values that are 959 10710 jonathan * close to each other and local latencies are made the same. Lastly, the 960 10710 jonathan * latencies are verified to make sure that certain conditions are met (eg. 961 10710 jonathan * local < remote, latency(a, b) == latency(b, a), etc.). 962 10710 jonathan * 963 10710 jonathan * If any of the conditions aren't met, the code will export a NUMA 964 10710 jonathan * configuration with the local CPUs and memory given by the SRAT or PCI config 965 10710 jonathan * space registers and one remote memory latency since it can't tell exactly 966 10710 jonathan * how far each node is from each other. 
967 10710 jonathan */ 968 10710 jonathan void 969 10710 jonathan lgrp_plat_probe(void) 970 10710 jonathan { 971 10710 jonathan int from; 972 10710 jonathan int i; 973 10710 jonathan lgrp_plat_latency_stats_t *lat_stats; 974 10710 jonathan boolean_t probed; 975 10710 jonathan hrtime_t probe_time; 976 10710 jonathan int to; 977 10710 jonathan 978 10710 jonathan if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 979 10710 jonathan max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 980 10710 jonathan return; 981 0 stevel 982 0 stevel /* 983 10710 jonathan * Determine ID of node containing current CPU 984 0 stevel */ 985 10710 jonathan from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 986 10710 jonathan lgrp_plat_cpu_node_nentries); 987 10710 jonathan ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 988 10710 jonathan if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) 989 10710 jonathan ASSERT(lgrp_plat_node_domain[from].exists); 990 10710 jonathan 991 10710 jonathan /* 992 10710 jonathan * Don't need to probe if got times already 993 10710 jonathan */ 994 10710 jonathan lat_stats = &lgrp_plat_lat_stats; 995 10710 jonathan if (lat_stats->latencies[from][from] != 0) 996 0 stevel return; 997 10710 jonathan 998 10710 jonathan /* 999 10710 jonathan * Read vendor ID in Northbridge or read and write page(s) 1000 10710 jonathan * in each node from current CPU and remember how long it takes, 1001 10710 jonathan * so we can build latency topology of machine later. 1002 10710 jonathan * This should approximate the memory latency between each node. 
1003 10710 jonathan */ 1004 10710 jonathan probed = B_FALSE; 1005 10710 jonathan for (i = 0; i < lgrp_plat_probe_nrounds; i++) { 1006 10710 jonathan for (to = 0; to < lgrp_plat_node_cnt; to++) { 1007 10710 jonathan /* 1008 10710 jonathan * Get probe time and skip over any nodes that can't be 1009 10710 jonathan * probed yet or don't have memory 1010 10710 jonathan */ 1011 10710 jonathan probe_time = lgrp_plat_probe_time(to, 1012 10710 jonathan lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries, 1013 10710 jonathan &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats, 1014 10710 jonathan &lgrp_plat_probe_stats); 1015 10710 jonathan if (probe_time == 0) 1016 10710 jonathan continue; 1017 10710 jonathan 1018 10710 jonathan probed = B_TRUE; 1019 10710 jonathan 1020 10710 jonathan /* 1021 10710 jonathan * Keep lowest probe time as latency between nodes 1022 10710 jonathan */ 1023 10710 jonathan if (lat_stats->latencies[from][to] == 0 || 1024 10710 jonathan probe_time < lat_stats->latencies[from][to]) 1025 10710 jonathan lat_stats->latencies[from][to] = probe_time; 1026 10710 jonathan 1027 10710 jonathan /* 1028 10710 jonathan * Update overall minimum and maximum probe times 1029 10710 jonathan * across all nodes 1030 10710 jonathan */ 1031 10710 jonathan if (probe_time < lat_stats->latency_min || 1032 10710 jonathan lat_stats->latency_min == -1) 1033 10710 jonathan lat_stats->latency_min = probe_time; 1034 10710 jonathan if (probe_time > lat_stats->latency_max) 1035 10710 jonathan lat_stats->latency_max = probe_time; 1036 10710 jonathan } 1037 0 stevel } 1038 0 stevel 1039 0 stevel /* 1040 10710 jonathan * Bail out if weren't able to probe any nodes from current CPU 1041 6671 jjc */ 1042 10710 jonathan if (probed == B_FALSE) 1043 10710 jonathan return; 1044 10710 jonathan 1045 10710 jonathan /* 1046 10710 jonathan * - Fix up latencies such that local latencies are same, 1047 10710 jonathan * latency(i, j) == latency(j, i), etc. 
(if possible) 1048 10710 jonathan * 1049 10710 jonathan * - Verify that latencies look ok 1050 10710 jonathan * 1051 10710 jonathan * - Fallback to just optimizing for local and remote if 1052 10710 jonathan * latencies didn't look right 1053 10710 jonathan */ 1054 10710 jonathan lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats, 1055 10710 jonathan &lgrp_plat_probe_stats); 1056 10710 jonathan lgrp_plat_probe_stats.probe_error_code = 1057 10710 jonathan lgrp_plat_latency_verify(lgrp_plat_node_memory, 1058 10710 jonathan &lgrp_plat_lat_stats); 1059 10710 jonathan if (lgrp_plat_probe_stats.probe_error_code) 1060 10710 jonathan lgrp_plat_2level_setup(lgrp_plat_node_memory, 1061 10710 jonathan &lgrp_plat_lat_stats); 1062 10710 jonathan } 1063 10710 jonathan 1064 10710 jonathan 1065 10710 jonathan /* 1066 10710 jonathan * Return platform handle for root lgroup 1067 10710 jonathan */ 1068 10710 jonathan lgrp_handle_t 1069 10710 jonathan lgrp_plat_root_hand(void) 1070 10710 jonathan { 1071 10710 jonathan return (LGRP_DEFAULT_HANDLE); 1072 10710 jonathan } 1073 10710 jonathan 1074 10710 jonathan 1075 10710 jonathan /* 1076 10710 jonathan * INTERNAL ROUTINES 1077 10710 jonathan */ 1078 10710 jonathan 1079 10710 jonathan 1080 10710 jonathan /* 1081 10710 jonathan * Update CPU to node mapping for given CPU and proximity domain (and returns 1082 10710 jonathan * negative numbers for errors and positive ones for success) 1083 10710 jonathan */ 1084 10710 jonathan static int 1085 10710 jonathan lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt, 1086 10710 jonathan cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain) 1087 10710 jonathan { 1088 10710 jonathan uint_t i; 1089 10710 jonathan int node; 1090 10710 jonathan 1091 10710 jonathan /* 1092 10710 jonathan * Get node number for proximity domain 1093 10710 jonathan */ 1094 10710 jonathan node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 1095 10710 
jonathan if (node == -1) { 1096 10710 jonathan node = lgrp_plat_node_domain_update(node_domain, node_cnt, 1097 10710 jonathan domain); 1098 10710 jonathan if (node == -1) 1099 10710 jonathan return (-1); 1100 10710 jonathan } 1101 10710 jonathan 1102 10710 jonathan /* 1103 10710 jonathan * Search for entry with given APIC ID and fill in its node and 1104 10710 jonathan * proximity domain IDs (if they haven't been set already) 1105 10710 jonathan */ 1106 10710 jonathan for (i = 0; i < nentries; i++) { 1107 10710 jonathan /* 1108 10710 jonathan * Skip nonexistent entries and ones without matching APIC ID 1109 10710 jonathan */ 1110 10710 jonathan if (!cpu_node[i].exists || cpu_node[i].apicid != apicid) 1111 10710 jonathan continue; 1112 10710 jonathan 1113 10710 jonathan /* 1114 10710 jonathan * Just return if entry completely and correctly filled in 1115 10710 jonathan * already 1116 10710 jonathan */ 1117 10710 jonathan if (cpu_node[i].prox_domain == domain && 1118 10710 jonathan cpu_node[i].node == node) 1119 10710 jonathan return (1); 1120 10710 jonathan 1121 10710 jonathan /* 1122 10710 jonathan * Fill in node and proximity domain IDs 1123 10710 jonathan */ 1124 10710 jonathan cpu_node[i].prox_domain = domain; 1125 10710 jonathan cpu_node[i].node = node; 1126 10710 jonathan 1127 10710 jonathan return (0); 1128 10710 jonathan } 1129 10710 jonathan 1130 10710 jonathan /* 1131 10710 jonathan * Return error when entry for APIC ID wasn't found in table 1132 10710 jonathan */ 1133 10710 jonathan return (-2); 1134 10710 jonathan } 1135 10710 jonathan 1136 10710 jonathan 1137 10710 jonathan /* 1138 10710 jonathan * Get node ID for given CPU 1139 10710 jonathan */ 1140 10710 jonathan static int 1141 10710 jonathan lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, 1142 10710 jonathan int cpu_node_nentries) 1143 10710 jonathan { 1144 10710 jonathan processorid_t cpuid; 1145 10710 jonathan 1146 10710 jonathan if (cp == NULL) 1147 10710 jonathan return (-1); 1148 
10710 jonathan 1149 10710 jonathan cpuid = cp->cpu_id; 1150 10710 jonathan if (cpuid < 0 || cpuid >= max_ncpus) 1151 10710 jonathan return (-1); 1152 10710 jonathan 1153 10710 jonathan /* 1154 10710 jonathan * SRAT doesn't exist, isn't enabled, or there was an error processing 1155 10947 Srihari * it, so return node ID for Opteron and -1 otherwise. 1156 10710 jonathan */ 1157 10710 jonathan if (srat_ptr == NULL || !lgrp_plat_srat_enable || 1158 10710 jonathan lgrp_plat_srat_error) { 1159 10710 jonathan if (is_opteron()) 1160 10947 Srihari return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE)); 1161 10710 jonathan return (-1); 1162 10710 jonathan } 1163 10710 jonathan 1164 10710 jonathan /* 1165 10710 jonathan * Return -1 when CPU to node ID mapping entry doesn't exist for given 1166 10710 jonathan * CPU 1167 10710 jonathan */ 1168 10710 jonathan if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists) 1169 10710 jonathan return (-1); 1170 10710 jonathan 1171 10710 jonathan return (cpu_node[cpuid].node); 1172 10710 jonathan } 1173 10710 jonathan 1174 10710 jonathan 1175 10710 jonathan /* 1176 10710 jonathan * Return node number for given proximity domain/system locality 1177 10710 jonathan */ 1178 10710 jonathan static int 1179 10710 jonathan lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt, 1180 10710 jonathan uint32_t domain) 1181 10710 jonathan { 1182 10710 jonathan uint_t node; 1183 10710 jonathan uint_t start; 1184 10710 jonathan 1185 10710 jonathan /* 1186 10710 jonathan * Hash proximity domain ID into node to domain mapping table (array), 1187 10710 jonathan * search for entry with matching proximity domain ID, and return index 1188 10710 jonathan * of matching entry as node ID. 
1189 10710 jonathan */ 1190 10710 jonathan node = start = NODE_DOMAIN_HASH(domain, node_cnt); 1191 10710 jonathan do { 1192 10710 jonathan if (node_domain[node].prox_domain == domain && 1193 10710 jonathan node_domain[node].exists) 1194 10710 jonathan return (node); 1195 10710 jonathan node = (node + 1) % node_cnt; 1196 10710 jonathan } while (node != start); 1197 10710 jonathan return (-1); 1198 10710 jonathan } 1199 10710 jonathan 1200 10710 jonathan 1201 10710 jonathan /* 1202 10710 jonathan * Get NUMA configuration of machine 1203 10710 jonathan */ 1204 10710 jonathan static void 1205 10710 jonathan lgrp_plat_get_numa_config(void) 1206 10710 jonathan { 1207 10710 jonathan uint_t probe_op; 1208 10710 jonathan 1209 10710 jonathan /* 1210 10710 jonathan * Read boot property with CPU to APIC ID mapping table/array to 1211 10710 jonathan * determine number of CPUs 1212 10710 jonathan */ 1213 10710 jonathan lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL); 1214 6671 jjc 1215 6671 jjc /* 1216 6445 jjc * Determine which CPUs and memory are local to each other and number 1217 6445 jjc * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) 1218 0 stevel */ 1219 6706 jjc if (lgrp_plat_apic_ncpus > 0) { 1220 6706 jjc int retval; 1221 10710 jonathan 1222 10710 jonathan /* 1223 10710 jonathan * Temporarily allocate boot memory to use for CPU to node 1224 10710 jonathan * mapping since kernel memory allocator isn't alive yet 1225 10710 jonathan */ 1226 10710 jonathan lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops, 1227 10710 jonathan NULL, lgrp_plat_apic_ncpus * sizeof (cpu_node_map_t), 1228 10710 jonathan sizeof (int)); 1229 10710 jonathan 1230 10710 jonathan ASSERT(lgrp_plat_cpu_node != NULL); 1231 10710 jonathan if (lgrp_plat_cpu_node) { 1232 10710 jonathan lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus; 1233 10710 jonathan bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries * 1234 10710 jonathan sizeof (cpu_node_map_t)); 1235 10710 
jonathan } 1236 10710 jonathan 1237 10710 jonathan /* 1238 10710 jonathan * Fill in CPU to node ID mapping table with APIC ID for each 1239 10710 jonathan * CPU 1240 10710 jonathan */ 1241 10710 jonathan (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); 1242 6706 jjc 1243 6706 jjc retval = lgrp_plat_process_srat(srat_ptr, 1244 9716 jonathan &lgrp_plat_prox_domain_min, 1245 6706 jjc lgrp_plat_node_domain, lgrp_plat_cpu_node, 1246 6706 jjc lgrp_plat_apic_ncpus, lgrp_plat_node_memory); 1247 6706 jjc if (retval <= 0) { 1248 6706 jjc lgrp_plat_srat_error = retval; 1249 6706 jjc lgrp_plat_node_cnt = 1; 1250 6706 jjc } else { 1251 6706 jjc lgrp_plat_srat_error = 0; 1252 6706 jjc lgrp_plat_node_cnt = retval; 1253 6706 jjc } 1254 6671 jjc } 1255 0 stevel 1256 0 stevel /* 1257 6671 jjc * Try to use PCI config space registers on Opteron if there's an error 1258 6671 jjc * processing CPU to APIC ID mapping or SRAT 1259 0 stevel */ 1260 6706 jjc if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && 1261 6671 jjc is_opteron()) 1262 6445 jjc opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, 1263 6445 jjc lgrp_plat_node_memory); 1264 0 stevel 1265 4898 jjc /* 1266 6445 jjc * Don't bother to setup system for multiple lgroups and only use one 1267 6445 jjc * memory node when memory is interleaved between any nodes or there is 1268 6445 jjc * only one NUMA node 1269 6445 jjc * 1270 6445 jjc * NOTE: May need to change this for Dynamic Reconfiguration (DR) 1271 6445 jjc * when and if it happens for x86/x64 1272 4898 jjc */ 1273 6445 jjc if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { 1274 6445 jjc lgrp_plat_node_cnt = max_mem_nodes = 1; 1275 6445 jjc (void) lgrp_topo_ht_limit_set(1); 1276 6445 jjc return; 1277 4898 jjc } 1278 4898 jjc 1279 6445 jjc /* 1280 6445 jjc * Leaf lgroups on x86/x64 architectures contain one physical 1281 6445 jjc * processor chip. 
Tune lgrp_expand_proc_thresh and 1282 6445 jjc * lgrp_expand_proc_diff so that lgrp_choose() will spread 1283 6445 jjc * things out aggressively. 1284 6445 jjc */ 1285 6445 jjc lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 1286 6445 jjc lgrp_expand_proc_diff = 0; 1287 6445 jjc 1288 6445 jjc /* 1289 6445 jjc * There should be one memnode (physical page free list(s)) for 1290 6445 jjc * each node 1291 6445 jjc */ 1292 6445 jjc max_mem_nodes = lgrp_plat_node_cnt; 1293 6565 jjc 1294 6565 jjc /* 1295 6565 jjc * Initialize min and max latency before reading SLIT or probing 1296 6565 jjc */ 1297 6565 jjc lgrp_plat_lat_stats.latency_min = -1; 1298 6565 jjc lgrp_plat_lat_stats.latency_max = 0; 1299 6445 jjc 1300 6445 jjc /* 1301 6445 jjc * Determine how far each NUMA node is from each other by 1302 6445 jjc * reading ACPI System Locality Information Table (SLIT) if it 1303 6445 jjc * exists 1304 6445 jjc */ 1305 6445 jjc lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, 1306 6445 jjc lgrp_plat_node_cnt, lgrp_plat_node_memory, 1307 6445 jjc &lgrp_plat_lat_stats); 1308 6445 jjc if (lgrp_plat_slit_error == 0) 1309 6445 jjc return; 1310 6445 jjc 1311 6445 jjc /* 1312 6445 jjc * Probe to determine latency between NUMA nodes when SLIT 1313 6445 jjc * doesn't exist or make sense 1314 6445 jjc */ 1315 6445 jjc lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; 1316 6445 jjc 1317 6445 jjc /* 1318 6445 jjc * Specify whether to probe using vendor ID register or page copy 1319 6445 jjc * if hasn't been specified already or is overspecified 1320 6445 jjc */ 1321 6445 jjc probe_op = lgrp_plat_probe_flags & 1322 6445 jjc (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1323 6445 jjc 1324 6445 jjc if (probe_op == 0 || 1325 6445 jjc probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { 1326 6445 jjc lgrp_plat_probe_flags &= 1327 6445 jjc ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1328 6445 jjc if (is_opteron()) 1329 6445 jjc lgrp_plat_probe_flags |= 1330 6445 jjc 
LGRP_PLAT_PROBE_VENDOR; 1331 6445 jjc else 1332 6445 jjc lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; 1333 6445 jjc } 1334 6445 jjc 1335 6445 jjc /* 1336 6445 jjc * Probing errors can mess up the lgroup topology and 1337 6445 jjc * force us fall back to a 2 level lgroup topology. 1338 6445 jjc * Here we bound how tall the lgroup topology can grow 1339 6445 jjc * in hopes of avoiding any anamolies in probing from 1340 6445 jjc * messing up the lgroup topology by limiting the 1341 6445 jjc * accuracy of the latency topology. 1342 6445 jjc * 1343 6445 jjc * Assume that nodes will at least be configured in a 1344 6445 jjc * ring, so limit height of lgroup topology to be less 1345 6445 jjc * than number of nodes on a system with 4 or more 1346 6445 jjc * nodes 1347 6445 jjc */ 1348 6445 jjc if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == 1349 6445 jjc lgrp_topo_ht_limit_default()) 1350 6445 jjc (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 1351 0 stevel } 1352 0 stevel 1353 0 stevel 1354 0 stevel /* 1355 0 stevel * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 1356 0 stevel * be considered same 1357 0 stevel */ 1358 0 stevel #define LGRP_LAT_TOLERANCE_SHIFT 4 1359 0 stevel 1360 0 stevel int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 1361 0 stevel 1362 0 stevel 1363 0 stevel /* 1364 0 stevel * Adjust latencies between nodes to be symmetric, normalize latencies between 1365 0 stevel * any nodes that are within some tolerance to be same, and make local 1366 0 stevel * latencies be same 1367 0 stevel */ 1368 0 stevel static void 1369 6445 jjc lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 1370 6445 jjc lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1371 0 stevel { 1372 0 stevel int i; 1373 0 stevel int j; 1374 0 stevel int k; 1375 0 stevel int l; 1376 0 stevel u_longlong_t max; 1377 0 stevel u_longlong_t min; 1378 0 stevel u_longlong_t t; 1379 0 stevel u_longlong_t t1; 1380 0 
stevel u_longlong_t t2; 1381 2988 jjc const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 1382 0 stevel int lat_corrected[MAX_NODES][MAX_NODES]; 1383 0 stevel 1384 0 stevel /* 1385 6445 jjc * Nothing to do when this is an UMA machine or don't have args needed 1386 0 stevel */ 1387 0 stevel if (max_mem_nodes == 1) 1388 0 stevel return; 1389 6445 jjc 1390 6445 jjc ASSERT(node_memory != NULL && lat_stats != NULL && 1391 6445 jjc probe_stats != NULL); 1392 0 stevel 1393 0 stevel /* 1394 0 stevel * Make sure that latencies are symmetric between any two nodes 1395 0 stevel * (ie. latency(node0, node1) == latency(node1, node0)) 1396 0 stevel */ 1397 6445 jjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1398 6445 jjc if (!node_memory[i].exists) 1399 6445 jjc continue; 1400 6445 jjc 1401 0 stevel for (j = 0; j < lgrp_plat_node_cnt; j++) { 1402 6445 jjc if (!node_memory[j].exists) 1403 6445 jjc continue; 1404 6445 jjc 1405 6445 jjc t1 = lat_stats->latencies[i][j]; 1406 6445 jjc t2 = lat_stats->latencies[j][i]; 1407 0 stevel 1408 0 stevel if (t1 == 0 || t2 == 0 || t1 == t2) 1409 0 stevel continue; 1410 0 stevel 1411 0 stevel /* 1412 0 stevel * Latencies should be same 1413 0 stevel * - Use minimum of two latencies which should be same 1414 0 stevel * - Track suspect probe times not within tolerance of 1415 0 stevel * min value 1416 0 stevel * - Remember how much values are corrected by 1417 0 stevel */ 1418 0 stevel if (t1 > t2) { 1419 0 stevel t = t2; 1420 6445 jjc probe_stats->probe_errors[i][j] += t1 - t2; 1421 0 stevel if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 1422 6445 jjc probe_stats->probe_suspect[i][j]++; 1423 6445 jjc probe_stats->probe_suspect[j][i]++; 1424 0 stevel } 1425 0 stevel } else if (t2 > t1) { 1426 0 stevel t = t1; 1427 6445 jjc probe_stats->probe_errors[j][i] += t2 - t1; 1428 0 stevel if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 1429 6445 jjc probe_stats->probe_suspect[i][j]++; 1430 6445 jjc probe_stats->probe_suspect[j][i]++; 1431 0 
stevel } 1432 0 stevel } 1433 0 stevel 1434 6445 jjc lat_stats->latencies[i][j] = 1435 6445 jjc lat_stats->latencies[j][i] = t; 1436 0 stevel lgrp_config(cflag, t1, t); 1437 0 stevel lgrp_config(cflag, t2, t); 1438 0 stevel } 1439 6445 jjc } 1440 0 stevel 1441 0 stevel /* 1442 0 stevel * Keep track of which latencies get corrected 1443 0 stevel */ 1444 0 stevel for (i = 0; i < MAX_NODES; i++) 1445 0 stevel for (j = 0; j < MAX_NODES; j++) 1446 0 stevel lat_corrected[i][j] = 0; 1447 0 stevel 1448 0 stevel /* 1449 0 stevel * For every two nodes, see whether there is another pair of nodes which 1450 0 stevel * are about the same distance apart and make the latencies be the same 1451 0 stevel * if they are close enough together 1452 0 stevel */ 1453 6445 jjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1454 0 stevel for (j = 0; j < lgrp_plat_node_cnt; j++) { 1455 6445 jjc if (!node_memory[j].exists) 1456 6445 jjc continue; 1457 0 stevel /* 1458 0 stevel * Pick one pair of nodes (i, j) 1459 0 stevel * and get latency between them 1460 0 stevel */ 1461 6445 jjc t1 = lat_stats->latencies[i][j]; 1462 0 stevel 1463 0 stevel /* 1464 0 stevel * Skip this pair of nodes if there isn't a latency 1465 0 stevel * for it yet 1466 0 stevel */ 1467 0 stevel if (t1 == 0) 1468 0 stevel continue; 1469 0 stevel 1470 6445 jjc for (k = 0; k < lgrp_plat_node_cnt; k++) { 1471 0 stevel for (l = 0; l < lgrp_plat_node_cnt; l++) { 1472 6445 jjc if (!node_memory[l].exists) 1473 6445 jjc continue; 1474 0 stevel /* 1475 0 stevel * Pick another pair of nodes (k, l) 1476 0 stevel * not same as (i, j) and get latency 1477 0 stevel * between them 1478 0 stevel */ 1479 0 stevel if (k == i && l == j) 1480 0 stevel continue; 1481 0 stevel 1482 6445 jjc t2 = lat_stats->latencies[k][l]; 1483 0 stevel 1484 0 stevel /* 1485 0 stevel * Skip this pair of nodes if there 1486 0 stevel * isn't a latency for it yet 1487 0 stevel */ 1488 0 stevel 1489 0 stevel if (t2 == 0) 1490 0 stevel continue; 1491 0 stevel 1492 0 
stevel /* 1493 0 stevel * Skip nodes (k, l) if they already 1494 0 stevel * have same latency as (i, j) or 1495 0 stevel * their latency isn't close enough to 1496 0 stevel * be considered/made the same 1497 0 stevel */ 1498 0 stevel if (t1 == t2 || (t1 > t2 && t1 - t2 > 1499 0 stevel t1 >> lgrp_plat_probe_lt_shift) || 1500 0 stevel (t2 > t1 && t2 - t1 > 1501 0 stevel t2 >> lgrp_plat_probe_lt_shift)) 1502 0 stevel continue; 1503 0 stevel 1504 0 stevel /* 1505 0 stevel * Make latency(i, j) same as 1506 0 stevel * latency(k, l), try to use latency 1507 0 stevel * that has been adjusted already to get 1508 0 stevel * more consistency (if possible), and 1509 0 stevel * remember which latencies were 1510 0 stevel * adjusted for next time 1511 0 stevel */ 1512 0 stevel if (lat_corrected[i][j]) { 1513 0 stevel t = t1; 1514 0 stevel lgrp_config(cflag, t2, t); 1515 0 stevel t2 = t; 1516 0 stevel } else if (lat_corrected[k][l]) { 1517 0 stevel t = t2; 1518 0 stevel lgrp_config(cflag, t1, t); 1519 0 stevel t1 = t; 1520 0 stevel } else { 1521 0 stevel if (t1 > t2) 1522 0 stevel t = t2; 1523 0 stevel else 1524 0 stevel t = t1; 1525 0 stevel lgrp_config(cflag, t1, t); 1526 0 stevel lgrp_config(cflag, t2, t); 1527 0 stevel t1 = t2 = t; 1528 0 stevel } 1529 0 stevel 1530 6445 jjc lat_stats->latencies[i][j] = 1531 6445 jjc lat_stats->latencies[k][l] = t; 1532 0 stevel 1533 0 stevel lat_corrected[i][j] = 1534 0 stevel lat_corrected[k][l] = 1; 1535 0 stevel } 1536 6445 jjc } 1537 0 stevel } 1538 6445 jjc } 1539 0 stevel 1540 0 stevel /* 1541 0 stevel * Local latencies should be same 1542 0 stevel * - Find min and max local latencies 1543 0 stevel * - Make all local latencies be minimum 1544 0 stevel */ 1545 0 stevel min = -1; 1546 0 stevel max = 0; 1547 0 stevel for (i = 0; i < lgrp_plat_node_cnt; i++) { 1548 6445 jjc if (!node_memory[i].exists) 1549 6445 jjc continue; 1550 6445 jjc t = lat_stats->latencies[i][i]; 1551 0 stevel if (t == 0) 1552 0 stevel continue; 1553 0 stevel if 
(min == -1 || t < min) 1554 0 stevel min = t; 1555 0 stevel if (t > max) 1556 0 stevel max = t; 1557 0 stevel } 1558 0 stevel if (min != max) { 1559 0 stevel for (i = 0; i < lgrp_plat_node_cnt; i++) { 1560 0 stevel int local; 1561 0 stevel 1562 6445 jjc if (!node_memory[i].exists) 1563 6445 jjc continue; 1564 6445 jjc 1565 6445 jjc local = lat_stats->latencies[i][i]; 1566 0 stevel if (local == 0) 1567 0 stevel continue; 1568 0 stevel 1569 0 stevel /* 1570 0 stevel * Track suspect probe times that aren't within 1571 0 stevel * tolerance of minimum local latency and how much 1572 0 stevel * probe times are corrected by 1573 0 stevel */ 1574 0 stevel if (local - min > min >> lgrp_plat_probe_lt_shift) 1575 6445 jjc probe_stats->probe_suspect[i][i]++; 1576 0 stevel 1577 6445 jjc probe_stats->probe_errors[i][i] += local - min; 1578 0 stevel 1579 0 stevel /* 1580 0 stevel * Make local latencies be minimum 1581 0 stevel */ 1582 2988 jjc lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 1583 6445 jjc lat_stats->latencies[i][i] = min; 1584 0 stevel } 1585 0 stevel } 1586 0 stevel 1587 0 stevel /* 1588 0 stevel * Determine max probe time again since just adjusted latencies 1589 0 stevel */ 1590 6445 jjc lat_stats->latency_max = 0; 1591 6445 jjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1592 0 stevel for (j = 0; j < lgrp_plat_node_cnt; j++) { 1593 6445 jjc if (!node_memory[j].exists) 1594 6445 jjc continue; 1595 6445 jjc t = lat_stats->latencies[i][j]; 1596 6445 jjc if (t > lat_stats->latency_max) 1597 6445 jjc lat_stats->latency_max = t; 1598 0 stevel } 1599 6445 jjc } 1600 0 stevel } 1601 0 stevel 1602 0 stevel 1603 0 stevel /* 1604 0 stevel * Verify following about latencies between nodes: 1605 0 stevel * 1606 0 stevel * - Latencies should be symmetric (ie. 
latency(a, b) == latency(b, a)) 1607 0 stevel * - Local latencies same 1608 0 stevel * - Local < remote 1609 0 stevel * - Number of latencies seen is reasonable 1610 0 stevel * - Number of occurrences of a given latency should be more than 1 1611 0 stevel * 1612 0 stevel * Returns: 1613 0 stevel * 0 Success 1614 0 stevel * -1 Not symmetric 1615 0 stevel * -2 Local latencies not same 1616 0 stevel * -3 Local >= remote 1617 0 stevel */ 1618 0 stevel static int 1619 6445 jjc lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 1620 6445 jjc lgrp_plat_latency_stats_t *lat_stats) 1621 0 stevel { 1622 0 stevel int i; 1623 0 stevel int j; 1624 0 stevel u_longlong_t t1; 1625 0 stevel u_longlong_t t2; 1626 6445 jjc 1627 6445 jjc ASSERT(node_memory != NULL && lat_stats != NULL); 1628 0 stevel 1629 0 stevel /* 1630 50 jjc * Nothing to do when this is an UMA machine, lgroup topology is 1631 50 jjc * limited to 2 levels, or there aren't any probe times yet 1632 0 stevel */ 1633 0 stevel if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1634 6445 jjc lat_stats->latencies[0][0] == 0) 1635 0 stevel return (0); 1636 0 stevel 1637 0 stevel /* 1638 0 stevel * Make sure that latencies are symmetric between any two nodes 1639 0 stevel * (ie. 
latency(node0, node1) == latency(node1, node0)) 1640 0 stevel */ 1641 6445 jjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1642 6445 jjc if (!node_memory[i].exists) 1643 6445 jjc continue; 1644 0 stevel for (j = 0; j < lgrp_plat_node_cnt; j++) { 1645 6445 jjc if (!node_memory[j].exists) 1646 6445 jjc continue; 1647 6445 jjc t1 = lat_stats->latencies[i][j]; 1648 6445 jjc t2 = lat_stats->latencies[j][i]; 1649 0 stevel 1650 0 stevel if (t1 == 0 || t2 == 0 || t1 == t2) 1651 0 stevel continue; 1652 0 stevel 1653 0 stevel return (-1); 1654 0 stevel } 1655 6445 jjc } 1656 0 stevel 1657 0 stevel /* 1658 0 stevel * Local latencies should be same 1659 0 stevel */ 1660 6445 jjc t1 = lat_stats->latencies[0][0]; 1661 0 stevel for (i = 1; i < lgrp_plat_node_cnt; i++) { 1662 6445 jjc if (!node_memory[i].exists) 1663 6445 jjc continue; 1664 6445 jjc 1665 6445 jjc t2 = lat_stats->latencies[i][i]; 1666 0 stevel if (t2 == 0) 1667 0 stevel continue; 1668 0 stevel 1669 50 jjc if (t1 == 0) { 1670 50 jjc t1 = t2; 1671 50 jjc continue; 1672 50 jjc } 1673 50 jjc 1674 0 stevel if (t1 != t2) 1675 0 stevel return (-2); 1676 0 stevel } 1677 0 stevel 1678 0 stevel /* 1679 0 stevel * Local latencies should be less than remote 1680 0 stevel */ 1681 50 jjc if (t1) { 1682 6445 jjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1683 50 jjc for (j = 0; j < lgrp_plat_node_cnt; j++) { 1684 6445 jjc if (!node_memory[j].exists) 1685 6445 jjc continue; 1686 6445 jjc t2 = lat_stats->latencies[i][j]; 1687 50 jjc if (i == j || t2 == 0) 1688 50 jjc continue; 1689 0 stevel 1690 50 jjc if (t1 >= t2) 1691 50 jjc return (-3); 1692 50 jjc } 1693 0 stevel } 1694 0 stevel } 1695 0 stevel 1696 0 stevel return (0); 1697 0 stevel } 1698 0 stevel 1699 0 stevel 1700 0 stevel /* 1701 10710 jonathan * Platform-specific initialization 1702 10710 jonathan */ 1703 10710 jonathan static void 1704 10710 jonathan lgrp_plat_main_init(void) 1705 10710 jonathan { 1706 10710 jonathan int curnode; 1707 10710 jonathan int ht_limit; 1708 
10710 jonathan int i; 1709 10710 jonathan 1710 10710 jonathan /* 1711 10710 jonathan * Print a notice that MPO is disabled when memory is interleaved 1712 10710 jonathan * across nodes....Would do this when it is discovered, but can't 1713 10710 jonathan * because it happens way too early during boot.... 1714 10710 jonathan */ 1715 10710 jonathan if (lgrp_plat_mem_intrlv) 1716 10710 jonathan cmn_err(CE_NOTE, 1717 10710 jonathan "MPO disabled because memory is interleaved\n"); 1718 10710 jonathan 1719 10710 jonathan /* 1720 10710 jonathan * Don't bother to do any probing if it is disabled, there is only one 1721 10710 jonathan * node, or the height of the lgroup topology less than or equal to 2 1722 10710 jonathan */ 1723 10710 jonathan ht_limit = lgrp_topo_ht_limit(); 1724 10710 jonathan if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 1725 10710 jonathan max_mem_nodes == 1 || ht_limit <= 2) { 1726 10710 jonathan /* 1727 10710 jonathan * Setup lgroup latencies for 2 level lgroup topology 1728 10710 jonathan * (ie. local and remote only) if they haven't been set yet 1729 10710 jonathan */ 1730 10710 jonathan if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && 1731 10710 jonathan lgrp_plat_lat_stats.latency_max == 0) 1732 10710 jonathan lgrp_plat_2level_setup(lgrp_plat_node_memory, 1733 10710 jonathan &lgrp_plat_lat_stats); 1734 10710 jonathan return; 1735 10710 jonathan } 1736 10710 jonathan 1737 10710 jonathan if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 1738 10710 jonathan /* 1739 10710 jonathan * Should have been able to probe from CPU 0 when it was added 1740 10710 jonathan * to lgroup hierarchy, but may not have been able to then 1741 10710 jonathan * because it happens so early in boot that gethrtime() hasn't 1742 10710 jonathan * been initialized. 
(:-( 1743 10710 jonathan */ 1744 10710 jonathan curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 1745 10710 jonathan lgrp_plat_cpu_node_nentries); 1746 10710 jonathan ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); 1747 10710 jonathan if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) 1748 10710 jonathan lgrp_plat_probe(); 1749 10710 jonathan 1750 10710 jonathan return; 1751 10710 jonathan } 1752 10710 jonathan 1753 10710 jonathan /* 1754 10710 jonathan * When probing memory, use one page for every sample to determine 1755 10710 jonathan * lgroup topology and taking multiple samples 1756 10710 jonathan */ 1757 10710 jonathan if (lgrp_plat_probe_mem_config.probe_memsize == 0) 1758 10710 jonathan lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * 1759 10710 jonathan lgrp_plat_probe_nsamples; 1760 10710 jonathan 1761 10710 jonathan /* 1762 10710 jonathan * Map memory in each node needed for probing to determine latency 1763 10710 jonathan * topology 1764 10710 jonathan */ 1765 10710 jonathan for (i = 0; i < lgrp_plat_node_cnt; i++) { 1766 10710 jonathan int mnode; 1767 10710 jonathan 1768 10710 jonathan /* 1769 10710 jonathan * Skip this node and leave its probe page NULL 1770 10710 jonathan * if it doesn't have any memory 1771 10710 jonathan */ 1772 10710 jonathan mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); 1773 10710 jonathan if (!mem_node_config[mnode].exists) { 1774 10710 jonathan lgrp_plat_probe_mem_config.probe_va[i] = NULL; 1775 10710 jonathan continue; 1776 10710 jonathan } 1777 10710 jonathan 1778 10710 jonathan /* 1779 10710 jonathan * Allocate one kernel virtual page 1780 10710 jonathan */ 1781 10710 jonathan lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, 1782 10710 jonathan lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); 1783 10710 jonathan if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { 1784 10710 jonathan cmn_err(CE_WARN, 1785 10710 jonathan "lgrp_plat_main_init: couldn't allocate memory"); 
1786 10710 jonathan return; 1787 10710 jonathan } 1788 10710 jonathan 1789 10710 jonathan /* 1790 10710 jonathan * Get PFN for first page in each node 1791 10710 jonathan */ 1792 10710 jonathan lgrp_plat_probe_mem_config.probe_pfn[i] = 1793 10710 jonathan mem_node_config[mnode].physbase; 1794 10710 jonathan 1795 10710 jonathan /* 1796 10710 jonathan * Map virtual page to first page in node 1797 10710 jonathan */ 1798 10710 jonathan hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], 1799 10710 jonathan lgrp_plat_probe_mem_config.probe_memsize, 1800 10710 jonathan lgrp_plat_probe_mem_config.probe_pfn[i], 1801 10710 jonathan PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 1802 10710 jonathan HAT_LOAD_NOCONSIST); 1803 10710 jonathan } 1804 10710 jonathan 1805 10710 jonathan /* 1806 10710 jonathan * Probe from current CPU 1807 10710 jonathan */ 1808 10710 jonathan lgrp_plat_probe(); 1809 10710 jonathan } 1810 10710 jonathan 1811 10710 jonathan 1812 10710 jonathan /* 1813 0 stevel * Return the number of free, allocatable, or installed 1814 0 stevel * pages in an lgroup 1815 0 stevel * This is a copy of the MAX_MEM_NODES == 1 version of the routine 1816 0 stevel * used when MPO is disabled (i.e. 
single lgroup) or this is the root lgroup 1817 0 stevel */ 1818 0 stevel /* ARGSUSED */ 1819 0 stevel static pgcnt_t 1820 0 stevel lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 1821 0 stevel { 1822 0 stevel struct memlist *mlist; 1823 0 stevel pgcnt_t npgs = 0; 1824 0 stevel extern struct memlist *phys_avail; 1825 0 stevel extern struct memlist *phys_install; 1826 0 stevel 1827 0 stevel switch (query) { 1828 0 stevel case LGRP_MEM_SIZE_FREE: 1829 0 stevel return ((pgcnt_t)freemem); 1830 0 stevel case LGRP_MEM_SIZE_AVAIL: 1831 0 stevel memlist_read_lock(); 1832 0 stevel for (mlist = phys_avail; mlist; mlist = mlist->next) 1833 0 stevel npgs += btop(mlist->size); 1834 0 stevel memlist_read_unlock(); 1835 0 stevel return (npgs); 1836 0 stevel case LGRP_MEM_SIZE_INSTALL: 1837 0 stevel memlist_read_lock(); 1838 0 stevel for (mlist = phys_install; mlist; mlist = mlist->next) 1839 0 stevel npgs += btop(mlist->size); 1840 0 stevel memlist_read_unlock(); 1841 0 stevel return (npgs); 1842 0 stevel default: 1843 0 stevel return ((pgcnt_t)0); 1844 0 stevel } 1845 0 stevel } 1846 0 stevel 1847 6445 jjc 1848 0 stevel /* 1849 6445 jjc * Update node to proximity domain mappings for given domain and return node ID 1850 0 stevel */ 1851 6445 jjc static int 1852 6706 jjc lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, 1853 6706 jjc uint32_t domain) 1854 0 stevel { 1855 6445 jjc uint_t node; 1856 6445 jjc uint_t start; 1857 0 stevel 1858 6445 jjc /* 1859 6445 jjc * Hash proximity domain ID into node to domain mapping table (array) 1860 6445 jjc * and add entry for it into first non-existent or matching entry found 1861 6445 jjc */ 1862 6706 jjc node = start = NODE_DOMAIN_HASH(domain, node_cnt); 1863 6445 jjc do { 1864 6445 jjc /* 1865 6445 jjc * Entry doesn't exist yet, so create one for this proximity 1866 6445 jjc * domain and return node ID which is index into mapping table. 
1867 6445 jjc */ 1868 6445 jjc if (!node_domain[node].exists) { 1869 6445 jjc node_domain[node].exists = 1; 1870 6445 jjc node_domain[node].prox_domain = domain; 1871 6445 jjc return (node); 1872 6445 jjc } 1873 0 stevel 1874 6445 jjc /* 1875 6445 jjc * Entry exists for this proximity domain already, so just 1876 6445 jjc * return node ID (index into table). 1877 6445 jjc */ 1878 6445 jjc if (node_domain[node].prox_domain == domain) 1879 6445 jjc return (node); 1880 6706 jjc node = NODE_DOMAIN_HASH(node + 1, node_cnt); 1881 6445 jjc } while (node != start); 1882 0 stevel 1883 6445 jjc /* 1884 6445 jjc * Ran out of supported number of entries which shouldn't happen.... 1885 6445 jjc */ 1886 6445 jjc ASSERT(node != start); 1887 6445 jjc return (-1); 1888 6445 jjc } 1889 6445 jjc 1890 6445 jjc 1891 6445 jjc /* 1892 6445 jjc * Update node memory information for given proximity domain with specified 1893 6445 jjc * starting and ending physical address range (and return positive numbers for 1894 6445 jjc * success and negative ones for errors) 1895 6445 jjc */ 1896 6445 jjc static int 1897 6706 jjc lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt, 1898 6462 jjc node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end, 1899 6445 jjc uint32_t domain) 1900 6445 jjc { 1901 6445 jjc int node; 1902 6445 jjc 1903 6445 jjc /* 1904 6445 jjc * Get node number for proximity domain 1905 6445 jjc */ 1906 6706 jjc node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 1907 6445 jjc if (node == -1) { 1908 6706 jjc node = lgrp_plat_node_domain_update(node_domain, node_cnt, 1909 6706 jjc domain); 1910 6445 jjc if (node == -1) 1911 6445 jjc return (-1); 1912 6445 jjc } 1913 6445 jjc 1914 6445 jjc /* 1915 6445 jjc * Create entry in table for node if it doesn't exist 1916 6445 jjc */ 1917 6445 jjc if (!node_memory[node].exists) { 1918 6445 jjc node_memory[node].exists = 1; 1919 6445 jjc node_memory[node].start = btop(start); 1920 6445 jjc 
node_memory[node].end = btop(end); 1921 6445 jjc node_memory[node].prox_domain = domain; 1922 6445 jjc return (0); 1923 6445 jjc } 1924 6445 jjc 1925 6445 jjc /* 1926 6445 jjc * Entry already exists for this proximity domain 1927 6445 jjc * 1928 6445 jjc * There may be more than one SRAT memory entry for a domain, so we may 1929 6445 jjc * need to update existing start or end address for the node. 1930 6445 jjc */ 1931 6445 jjc if (node_memory[node].prox_domain == domain) { 1932 6445 jjc if (btop(start) < node_memory[node].start) 1933 6445 jjc node_memory[node].start = btop(start); 1934 6445 jjc if (btop(end) > node_memory[node].end) 1935 6445 jjc node_memory[node].end = btop(end); 1936 6445 jjc return (1); 1937 6445 jjc } 1938 6445 jjc return (-2); 1939 6445 jjc } 1940 6445 jjc 1941 6445 jjc 1942 6445 jjc /* 1943 9716 jonathan * Have to sort node by starting physical address because VM system (physical 1944 9716 jonathan * page free list management) assumes and expects memnodes to be sorted in 1945 9716 jonathan * ascending order by physical address. If not, the kernel will panic in 1946 9716 jonathan * potentially a number of different places. (:-( 1947 9716 jonathan * NOTE: This workaround will not be sufficient if/when hotplugging memory is 1948 9716 jonathan * supported on x86/x64. 
1949 9716 jonathan */ 1950 9716 jonathan static void 1951 9716 jonathan lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, 1952 9716 jonathan cpu_node_map_t *cpu_node, int cpu_count, node_phys_addr_map_t *node_memory) 1953 9716 jonathan { 1954 9716 jonathan boolean_t found; 1955 9716 jonathan int i; 1956 9716 jonathan int j; 1957 9716 jonathan int n; 1958 9716 jonathan boolean_t sorted; 1959 9716 jonathan boolean_t swapped; 1960 9716 jonathan 1961 9716 jonathan if (!lgrp_plat_node_sort_enable || node_cnt <= 1 || 1962 9716 jonathan node_domain == NULL || node_memory == NULL) 1963 9716 jonathan return; 1964 9716 jonathan 1965 9716 jonathan /* 1966 9716 jonathan * Sorted already? 1967 9716 jonathan */ 1968 9716 jonathan sorted = B_TRUE; 1969 9716 jonathan for (i = 0; i < node_cnt - 1; i++) { 1970 9716 jonathan /* 1971 9716 jonathan * Skip entries that don't exist 1972 9716 jonathan */ 1973 9716 jonathan if (!node_memory[i].exists) 1974 9716 jonathan continue; 1975 9716 jonathan 1976 9716 jonathan /* 1977 9716 jonathan * Try to find next existing entry to compare against 1978 9716 jonathan */ 1979 9716 jonathan found = B_FALSE; 1980 9716 jonathan for (j = i + 1; j < node_cnt; j++) { 1981 9716 jonathan if (node_memory[j].exists) { 1982 9716 jonathan found = B_TRUE; 1983 9716 jonathan break; 1984 9716 jonathan } 1985 9716 jonathan } 1986 9716 jonathan 1987 9716 jonathan /* 1988 9716 jonathan * Done if no more existing entries to compare against 1989 9716 jonathan */ 1990 9716 jonathan if (found == B_FALSE) 1991 9716 jonathan break; 1992 9716 jonathan 1993 9716 jonathan /* 1994 9716 jonathan * Not sorted if starting address of current entry is bigger 1995 9716 jonathan * than starting address of next existing entry 1996 9716 jonathan */ 1997 9716 jonathan if (node_memory[i].start > node_memory[j].start) { 1998 9716 jonathan sorted = B_FALSE; 1999 9716 jonathan break; 2000 9716 jonathan } 2001 9716 jonathan } 2002 9716 jonathan 2003 9716 jonathan /* 2004 
9716 jonathan * Don't need to sort if sorted already 2005 9716 jonathan */ 2006 9716 jonathan if (sorted == B_TRUE) 2007 9716 jonathan return; 2008 9716 jonathan 2009 9716 jonathan /* 2010 9716 jonathan * Just use bubble sort since number of nodes is small 2011 9716 jonathan */ 2012 9716 jonathan n = node_cnt; 2013 9716 jonathan do { 2014 9716 jonathan swapped = B_FALSE; 2015 9716 jonathan n--; 2016 9716 jonathan for (i = 0; i < n; i++) { 2017 9716 jonathan /* 2018 9716 jonathan * Skip entries that don't exist 2019 9716 jonathan */ 2020 9716 jonathan if (!node_memory[i].exists) 2021 9716 jonathan continue; 2022 9716 jonathan 2023 9716 jonathan /* 2024 9716 jonathan * Try to find next existing entry to compare against 2025 9716 jonathan */ 2026 9716 jonathan found = B_FALSE; 2027 9716 jonathan for (j = i + 1; j <= n; j++) { 2028 9716 jonathan if (node_memory[j].exists) { 2029 9716 jonathan found = B_TRUE; 2030 9716 jonathan break; 2031 9716 jonathan } 2032 9716 jonathan } 2033 9716 jonathan 2034 9716 jonathan /* 2035 9716 jonathan * Done if no more existing entries to compare against 2036 9716 jonathan */ 2037 9716 jonathan if (found == B_FALSE) 2038 9716 jonathan break; 2039 9716 jonathan 2040 9716 jonathan if (node_memory[i].start > node_memory[j].start) { 2041 9716 jonathan node_phys_addr_map_t save_addr; 2042 9716 jonathan node_domain_map_t save_node; 2043 9716 jonathan 2044 9716 jonathan /* 2045 9716 jonathan * Swap node to proxmity domain ID assignments 2046 9716 jonathan */ 2047 9716 jonathan bcopy(&node_domain[i], &save_node, 2048 9716 jonathan sizeof (node_domain_map_t)); 2049 9716 jonathan bcopy(&node_domain[j], &node_domain[i], 2050 9716 jonathan sizeof (node_domain_map_t)); 2051 9716 jonathan bcopy(&save_node, &node_domain[j], 2052 9716 jonathan sizeof (node_domain_map_t)); 2053 9716 jonathan 2054 9716 jonathan /* 2055 9716 jonathan * Swap node to physical memory assignments 2056 9716 jonathan */ 2057 9716 jonathan bcopy(&node_memory[i], &save_addr, 2058 
9716 jonathan sizeof (node_phys_addr_map_t)); 2059 9716 jonathan bcopy(&node_memory[j], &node_memory[i], 2060 9716 jonathan sizeof (node_phys_addr_map_t)); 2061 9716 jonathan bcopy(&save_addr, &node_memory[j], 2062 9716 jonathan sizeof (node_phys_addr_map_t)); 2063 9716 jonathan swapped = B_TRUE; 2064 9716 jonathan } 2065 9716 jonathan } 2066 9716 jonathan } while (swapped == B_TRUE); 2067 9716 jonathan 2068 9716 jonathan /* 2069 9716 jonathan * Check to make sure that CPUs assigned to correct node IDs now since 2070 9716 jonathan * node to proximity domain ID assignments may have been changed above 2071 9716 jonathan */ 2072 9716 jonathan if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1) 2073 9716 jonathan return; 2074 9716 jonathan for (i = 0; i < cpu_count; i++) { 2075 9716 jonathan int node; 2076 9716 jonathan 2077 9716 jonathan node = lgrp_plat_domain_to_node(node_domain, node_cnt, 2078 9716 jonathan cpu_node[i].prox_domain); 2079 9716 jonathan if (cpu_node[i].node != node) 2080 9716 jonathan cpu_node[i].node = node; 2081 9716 jonathan } 2082 9716 jonathan 2083 9716 jonathan } 2084 9716 jonathan 2085 9716 jonathan 2086 9716 jonathan /* 2087 6445 jjc * Return time needed to probe from current CPU to memory in given node 2088 6445 jjc */ 2089 6445 jjc static hrtime_t 2090 10710 jonathan lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries, 2091 6445 jjc lgrp_plat_probe_mem_config_t *probe_mem_config, 2092 6445 jjc lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 2093 6445 jjc { 2094 6445 jjc caddr_t buf; 2095 6445 jjc hrtime_t elapsed; 2096 6445 jjc hrtime_t end; 2097 6445 jjc int from; 2098 6445 jjc int i; 2099 6445 jjc int ipl; 2100 6445 jjc hrtime_t max; 2101 6445 jjc hrtime_t min; 2102 6445 jjc hrtime_t start; 2103 6445 jjc extern int use_sse_pagecopy; 2104 6445 jjc 2105 6445 jjc /* 2106 6445 jjc * Determine ID of node containing current CPU 2107 6445 jjc */ 2108 10710 jonathan from = 
lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries); 2109 6445 jjc ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 2110 6445 jjc 2111 6445 jjc /* 2112 6445 jjc * Do common work for probing main memory 2113 6445 jjc */ 2114 6445 jjc if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 2115 6445 jjc /* 2116 6445 jjc * Skip probing any nodes without memory and 2117 6445 jjc * set probe time to 0 2118 6445 jjc */ 2119 6445 jjc if (probe_mem_config->probe_va[to] == NULL) { 2120 6445 jjc lat_stats->latencies[from][to] = 0; 2121 6445 jjc return (0); 2122 6445 jjc } 2123 6445 jjc 2124 6445 jjc /* 2125 6445 jjc * Invalidate caches once instead of once every sample 2126 6445 jjc * which should cut cost of probing by a lot 2127 6445 jjc */ 2128 6445 jjc probe_stats->flush_cost = gethrtime(); 2129 6445 jjc invalidate_cache(); 2130 6445 jjc probe_stats->flush_cost = gethrtime() - 2131 6445 jjc probe_stats->flush_cost; 2132 6445 jjc probe_stats->probe_cost_total += probe_stats->flush_cost; 2133 6445 jjc } 2134 6445 jjc 2135 6445 jjc /* 2136 6445 jjc * Probe from current CPU to given memory using specified operation 2137 6445 jjc * and take specified number of samples 2138 6445 jjc */ 2139 6445 jjc max = 0; 2140 6445 jjc min = -1; 2141 6445 jjc for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 2142 6445 jjc probe_stats->probe_cost = gethrtime(); 2143 6445 jjc 2144 6445 jjc /* 2145 6445 jjc * Can't measure probe time if gethrtime() isn't working yet 2146 6445 jjc */ 2147 6445 jjc if (probe_stats->probe_cost == 0 && gethrtime() == 0) 2148 6445 jjc return (0); 2149 6445 jjc 2150 6445 jjc if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 2151 6445 jjc /* 2152 6445 jjc * Measure how long it takes to read vendor ID from 2153 6445 jjc * Northbridge 2154 6445 jjc */ 2155 6445 jjc elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads); 2156 6445 jjc } else { 2157 6445 jjc /* 2158 6445 jjc * Measure how long it takes to copy page 2159 6445 jjc * on top of itself 2160 6445 jjc */ 
2161 6445 jjc buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 2162 6445 jjc 2163 6445 jjc kpreempt_disable(); 2164 6445 jjc ipl = splhigh(); 2165 6445 jjc start = gethrtime(); 2166 6445 jjc if (use_sse_pagecopy) 2167 6445 jjc hwblkpagecopy(buf, buf); 2168 6445 jjc else 2169 6445 jjc bcopy(buf, buf, PAGESIZE); 2170 6445 jjc end = gethrtime(); 2171 6445 jjc elapsed = end - start; 2172 6445 jjc splx(ipl); 2173 6445 jjc kpreempt_enable(); 2174 6445 jjc } 2175 6445 jjc 2176 6445 jjc probe_stats->probe_cost = gethrtime() - 2177 6445 jjc probe_stats->probe_cost; 2178 6445 jjc probe_stats->probe_cost_total += probe_stats->probe_cost; 2179 6445 jjc 2180 6445 jjc if (min == -1 || elapsed < min) 2181 6445 jjc min = elapsed; 2182 6445 jjc if (elapsed > max) 2183 6445 jjc max = elapsed; 2184 6445 jjc } 2185 6445 jjc 2186 6445 jjc /* 2187 6445 jjc * Update minimum and maximum probe times between 2188 6445 jjc * these two nodes 2189 6445 jjc */ 2190 6445 jjc if (min < probe_stats->probe_min[from][to] || 2191 6445 jjc probe_stats->probe_min[from][to] == 0) 2192 6445 jjc probe_stats->probe_min[from][to] = min; 2193 6445 jjc 2194 6445 jjc if (max > probe_stats->probe_max[from][to]) 2195 6445 jjc probe_stats->probe_max[from][to] = max; 2196 6445 jjc 2197 6445 jjc return (min); 2198 6445 jjc } 2199 6445 jjc 2200 6445 jjc 2201 6445 jjc /* 2202 6706 jjc * Read boot property with CPU to APIC ID array, fill in CPU to node ID 2203 10710 jonathan * mapping table with APIC ID for each CPU (if pointer to table isn't NULL), 2204 10710 jonathan * and return number of CPU APIC IDs. 2205 6671 jjc * 2206 6671 jjc * NOTE: This code assumes that CPU IDs are assigned in order that they appear 2207 6671 jjc * in in cpu_apicid_array boot property which is based on and follows 2208 6671 jjc * same ordering as processor list in ACPI MADT. 
If the code in 2209 6671 jjc * usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns 2210 6671 jjc * CPU IDs ever changes, then this code will need to change too.... 2211 6671 jjc */ 2212 6671 jjc static int 2213 6706 jjc lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node) 2214 6671 jjc { 2215 6706 jjc int boot_prop_len; 2216 6671 jjc char *boot_prop_name = BP_CPU_APICID_ARRAY; 2217 6671 jjc uint8_t cpu_apicid_array[UINT8_MAX + 1]; 2218 6671 jjc int i; 2219 6706 jjc int n; 2220 6671 jjc 2221 6671 jjc /* 2222 6671 jjc * Check length of property value 2223 6671 jjc */ 2224 6671 jjc boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name); 2225 6706 jjc if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array)) 2226 10710 jonathan return (-1); 2227 6706 jjc 2228 6706 jjc /* 2229 6706 jjc * Calculate number of entries in array and return when there's just 2230 6706 jjc * one CPU since that's not very interesting for NUMA 2231 6706 jjc */ 2232 6706 jjc n = boot_prop_len / sizeof (uint8_t); 2233 6706 jjc if (n == 1) 2234 10710 jonathan return (-2); 2235 6671 jjc 2236 6671 jjc /* 2237 6671 jjc * Get CPU to APIC ID property value 2238 6671 jjc */ 2239 6671 jjc if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0) 2240 10710 jonathan return (-3); 2241 10710 jonathan 2242 10710 jonathan /* 2243 10710 jonathan * Just return number of CPU APIC IDs if CPU to node mapping table is 2244 10710 jonathan * NULL 2245 10710 jonathan */ 2246 10710 jonathan if (cpu_node == NULL) 2247 10710 jonathan return (n); 2248 6671 jjc 2249 6671 jjc /* 2250 6671 jjc * Fill in CPU to node ID mapping table with APIC ID for each CPU 2251 6671 jjc */ 2252 6706 jjc for (i = 0; i < n; i++) { 2253 6671 jjc cpu_node[i].exists = 1; 2254 6671 jjc cpu_node[i].apicid = cpu_apicid_array[i]; 2255 6671 jjc } 2256 6671 jjc 2257 6706 jjc /* 2258 6706 jjc * Return number of CPUs based on number of APIC IDs 2259 6706 jjc */ 2260 6706 jjc return (n); 2261 6671 jjc } 2262 6671 jjc 2263 
6671 jjc 2264 6671 jjc /* 2265 6445 jjc * Read ACPI System Locality Information Table (SLIT) to determine how far each 2266 6445 jjc * NUMA node is from each other 2267 6445 jjc */ 2268 6445 jjc static int 2269 6445 jjc lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 2270 6445 jjc node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats) 2271 6445 jjc { 2272 6445 jjc int i; 2273 6445 jjc int j; 2274 6445 jjc int localities; 2275 6445 jjc hrtime_t max; 2276 6445 jjc hrtime_t min; 2277 6445 jjc int retval; 2278 6445 jjc uint8_t *slit_entries; 2279 6445 jjc 2280 6445 jjc if (tp == NULL || !lgrp_plat_slit_enable) 2281 6445 jjc return (1); 2282 6445 jjc 2283 6445 jjc if (lat_stats == NULL) 2284 6445 jjc return (2); 2285 6445 jjc 2286 6445 jjc localities = tp->number; 2287 6445 jjc if (localities != node_cnt) 2288 6445 jjc return (3); 2289 6445 jjc 2290 6445 jjc min = lat_stats->latency_min; 2291 6445 jjc max = lat_stats->latency_max; 2292 6445 jjc 2293 6445 jjc /* 2294 6445 jjc * Fill in latency matrix based on SLIT entries 2295 6445 jjc */ 2296 6445 jjc slit_entries = tp->entry; 2297 6445 jjc for (i = 0; i < localities; i++) { 2298 6445 jjc for (j = 0; j < localities; j++) { 2299 6445 jjc uint8_t latency; 2300 6445 jjc 2301 6445 jjc latency = slit_entries[(i * localities) + j]; 2302 6445 jjc lat_stats->latencies[i][j] = latency; 2303 6565 jjc if (latency < min || min == -1) 2304 6445 jjc min = latency; 2305 6445 jjc if (latency > max) 2306 6445 jjc max = latency; 2307 6445 jjc } 2308 6445 jjc } 2309 6445 jjc 2310 6445 jjc /* 2311 6445 jjc * Verify that latencies/distances given in SLIT look reasonable 2312 6445 jjc */ 2313 6445 jjc retval = lgrp_plat_latency_verify(node_memory, lat_stats); 2314 6445 jjc 2315 6445 jjc if (retval) { 2316 6445 jjc /* 2317 6445 jjc * Reinitialize (zero) latency table since SLIT doesn't look 2318 6445 jjc * right 2319 6445 jjc */ 2320 6445 jjc for (i = 0; i < localities; i++) { 2321 6445 jjc for (j = 0; j < 
localities; j++) 2322 6445 jjc lat_stats->latencies[i][j] = 0; 2323 6445 jjc } 2324 6445 jjc } else { 2325 6445 jjc /* 2326 6445 jjc * Update min and max latencies seen since SLIT looks valid 2327 6445 jjc */ 2328 6445 jjc lat_stats->latency_min = min; 2329 6445 jjc lat_stats->latency_max = max; 2330 6445 jjc } 2331 6445 jjc 2332 6445 jjc return (retval); 2333 6445 jjc } 2334 6445 jjc 2335 6445 jjc 2336 6445 jjc /* 2337 6445 jjc * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs 2338 6706 jjc * and memory are local to each other in the same NUMA node and return number 2339 6706 jjc * of nodes 2340 6445 jjc */ 2341 6445 jjc static int 2342 9716 jonathan lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min, 2343 9716 jonathan node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, int cpu_count, 2344 9716 jonathan node_phys_addr_map_t *node_memory) 2345 6445 jjc { 2346 6565 jjc struct srat_item *srat_end; 2347 6445 jjc int i; 2348 6445 jjc struct srat_item *item; 2349 6706 jjc int node_cnt; 2350 6671 jjc int proc_entry_count; 2351 6445 jjc 2352 6706 jjc /* 2353 6706 jjc * Nothing to do when no SRAT or disabled 2354 6706 jjc */ 2355 6445 jjc if (tp == NULL || !lgrp_plat_srat_enable) 2356 6706 jjc return (-1); 2357 6445 jjc 2358 6445 jjc /* 2359 6445 jjc * Determine number of nodes by counting number of proximity domains in 2360 6706 jjc * SRAT and return if number of nodes is 1 or less since don't need to 2361 6706 jjc * read SRAT then 2362 6445 jjc */ 2363 9716 jonathan node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min); 2364 6706 jjc if (node_cnt == 1) 2365 6706 jjc return (1); 2366 6706 jjc else if (node_cnt <= 0) 2367 6706 jjc return (-2); 2368 6445 jjc 2369 6445 jjc /* 2370 6445 jjc * Walk through SRAT, examining each CPU and memory entry to determine 2371 6445 jjc * which CPUs and memory belong to which node. 
2372 6445 jjc */ 2373 6445 jjc item = tp->list; 2374 6565 jjc srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2375 6671 jjc proc_entry_count = 0; 2376 6565 jjc while (item < srat_end) { 2377 6445 jjc uint32_t apic_id; 2378 6445 jjc uint32_t domain; 2379 6445 jjc uint64_t end; 2380 6445 jjc uint64_t length; 2381 6445 jjc uint64_t start; 2382 6445 jjc 2383 6445 jjc switch (item->type) { 2384 6445 jjc case SRAT_PROCESSOR: /* CPU entry */ 2385 6445 jjc if (!(item->i.p.flags & SRAT_ENABLED) || 2386 6445 jjc cpu_node == NULL) 2387 0 stevel break; 2388 6445 jjc 2389 6445 jjc /* 2390 6445 jjc * Calculate domain (node) ID and fill in APIC ID to 2391 6445 jjc * domain/node mapping table 2392 6445 jjc */ 2393 6445 jjc domain = item->i.p.domain1; 2394 6445 jjc for (i = 0; i < 3; i++) { 2395 6445 jjc domain += item->i.p.domain2[i] << 2396 6445 jjc ((i + 1) * 8); 2397 6445 jjc } 2398 6445 jjc apic_id = item->i.p.apic_id; 2399 6445 jjc 2400 6706 jjc if (lgrp_plat_cpu_node_update(node_domain, node_cnt, 2401 6706 jjc cpu_node, cpu_count, apic_id, domain) < 0) 2402 6706 jjc return (-3); 2403 6671 jjc 2404 6671 jjc proc_entry_count++; 2405 6445 jjc break; 2406 6445 jjc 2407 6445 jjc case SRAT_MEMORY: /* memory entry */ 2408 6445 jjc if (!(item->i.m.flags & SRAT_ENABLED) || 2409 6445 jjc node_memory == NULL) 2410 0 stevel break; 2411 6445 jjc 2412 6445 jjc /* 2413 6445 jjc * Get domain (node) ID and fill in domain/node 2414 6445 jjc * to memory mapping table 2415 6445 jjc */ 2416 6445 jjc domain = item->i.m.domain; 2417 6445 jjc start = item->i.m.base_addr; 2418 6445 jjc length = item->i.m.len; 2419 6445 jjc end = start + length - 1; 2420 6445 jjc 2421 6706 jjc if (lgrp_plat_node_memory_update(node_domain, node_cnt, 2422 6445 jjc node_memory, start, end, domain) < 0) 2423 6706 jjc return (-4); 2424 6445 jjc break; 2425 7282 mishra case SRAT_X2APIC: /* x2apic CPU entry */ 2426 7282 mishra if (!(item->i.xp.flags & SRAT_ENABLED) || 2427 7282 mishra cpu_node == NULL) 2428 
7282 mishra break; 2429 7282 mishra 2430 7282 mishra /* 2431 7282 mishra * Calculate domain (node) ID and fill in APIC ID to 2432 7282 mishra * domain/node mapping table 2433 7282 mishra */ 2434 7282 mishra domain = item->i.xp.domain; 2435 7282 mishra apic_id = item->i.xp.x2apic_id; 2436 7282 mishra 2437 7282 mishra if (lgrp_plat_cpu_node_update(node_domain, node_cnt, 2438 7282 mishra cpu_node, cpu_count, apic_id, domain) < 0) 2439 7282 mishra return (-3); 2440 7282 mishra 2441 7282 mishra proc_entry_count++; 2442 7282 mishra break; 2443 6445 jjc 2444 6445 jjc default: 2445 6445 jjc break; 2446 6445 jjc } 2447 6445 jjc 2448 6445 jjc item = (struct srat_item *)((uintptr_t)item + item->len); 2449 6445 jjc } 2450 6671 jjc 2451 6671 jjc /* 2452 6671 jjc * Should have seen at least as many SRAT processor entries as CPUs 2453 6671 jjc */ 2454 6706 jjc if (proc_entry_count < cpu_count) 2455 6706 jjc return (-5); 2456 6671 jjc 2457 9716 jonathan /* 2458 9716 jonathan * Need to sort nodes by starting physical address since VM system 2459 9716 jonathan * assumes and expects memnodes to be sorted in ascending order by 2460 9716 jonathan * physical address 2461 9716 jonathan */ 2462 9716 jonathan lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count, 2463 9716 jonathan node_memory); 2464 9716 jonathan 2465 6706 jjc return (node_cnt); 2466 10710 jonathan } 2467 10710 jonathan 2468 10710 jonathan 2469 10710 jonathan /* 2470 10710 jonathan * Allocate permanent memory for any temporary memory that we needed to 2471 10710 jonathan * allocate using BOP_ALLOC() before kmem_alloc() and VM system were 2472 10710 jonathan * initialized and copy everything from temporary to permanent memory since 2473 10710 jonathan * temporary boot memory will eventually be released during boot 2474 10710 jonathan */ 2475 10710 jonathan static void 2476 10710 jonathan lgrp_plat_release_bootstrap(void) 2477 10710 jonathan { 2478 10710 jonathan void *buf; 2479 10710 jonathan size_t size; 2480 
10710 jonathan 2481 10710 jonathan if (lgrp_plat_cpu_node_nentries > 0) { 2482 10710 jonathan size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t); 2483 10710 jonathan buf = kmem_alloc(size, KM_SLEEP); 2484 10710 jonathan bcopy(lgrp_plat_cpu_node, buf, size); 2485 10710 jonathan lgrp_plat_cpu_node = buf; 2486 10710 jonathan } 2487 6445 jjc } 2488 6445 jjc 2489 6445 jjc 2490 6445 jjc /* 2491 6445 jjc * Return number of proximity domains given in ACPI SRAT 2492 6445 jjc */ 2493 6445 jjc static int 2494 9716 jonathan lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min) 2495 6445 jjc { 2496 6445 jjc int domain_cnt; 2497 9716 jonathan uint32_t domain_min; 2498 6445 jjc struct srat_item *end; 2499 6445 jjc int i; 2500 6445 jjc struct srat_item *item; 2501 6445 jjc node_domain_map_t node_domain[MAX_NODES]; 2502 6445 jjc 2503 6445 jjc 2504 6445 jjc if (tp == NULL || !lgrp_plat_srat_enable) 2505 6445 jjc return (1); 2506 9716 jonathan 2507 9716 jonathan /* 2508 9716 jonathan * Walk through SRAT to find minimum proximity domain ID 2509 9716 jonathan */ 2510 9716 jonathan domain_min = UINT32_MAX; 2511 9716 jonathan item = tp->list; 2512 9716 jonathan end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2513 9716 jonathan while (item < end) { 2514 9716 jonathan uint32_t domain; 2515 9716 jonathan 2516 9716 jonathan switch (item->type) { 2517 9716 jonathan case SRAT_PROCESSOR: /* CPU entry */ 2518 9716 jonathan if (!(item->i.p.flags & SRAT_ENABLED)) { 2519 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2520 9716 jonathan item->len); 2521 9716 jonathan continue; 2522 9716 jonathan } 2523 9716 jonathan domain = item->i.p.domain1; 2524 9716 jonathan for (i = 0; i < 3; i++) { 2525 9716 jonathan domain += item->i.p.domain2[i] << 2526 9716 jonathan ((i + 1) * 8); 2527 9716 jonathan } 2528 9716 jonathan break; 2529 9716 jonathan 2530 9716 jonathan case SRAT_MEMORY: /* memory entry */ 2531 9716 jonathan if (!(item->i.m.flags & SRAT_ENABLED)) { 
2532 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2533 9716 jonathan item->len); 2534 9716 jonathan continue; 2535 9716 jonathan } 2536 9716 jonathan domain = item->i.m.domain; 2537 9716 jonathan break; 2538 9716 jonathan 2539 9716 jonathan case SRAT_X2APIC: /* x2apic CPU entry */ 2540 9716 jonathan if (!(item->i.xp.flags & SRAT_ENABLED)) { 2541 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2542 9716 jonathan item->len); 2543 9716 jonathan continue; 2544 9716 jonathan } 2545 9716 jonathan domain = item->i.xp.domain; 2546 9716 jonathan break; 2547 9716 jonathan 2548 9716 jonathan default: 2549 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2550 9716 jonathan item->len); 2551 9716 jonathan continue; 2552 9716 jonathan } 2553 9716 jonathan 2554 9716 jonathan /* 2555 9716 jonathan * Keep track of minimum proximity domain ID 2556 9716 jonathan */ 2557 9716 jonathan if (domain < domain_min) 2558 9716 jonathan domain_min = domain; 2559 9716 jonathan 2560 9716 jonathan item = (struct srat_item *)((uintptr_t)item + item->len); 2561 9716 jonathan } 2562 9716 jonathan if (lgrp_plat_domain_min_enable && prox_domain_min != NULL) 2563 9716 jonathan *prox_domain_min = domain_min; 2564 6445 jjc 2565 6445 jjc /* 2566 6445 jjc * Walk through SRAT, examining each CPU and memory entry to determine 2567 6445 jjc * proximity domain ID for each. 
2568 6445 jjc */ 2569 6445 jjc domain_cnt = 0; 2570 6445 jjc item = tp->list; 2571 6445 jjc end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2572 6445 jjc bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t)); 2573 6445 jjc while (item < end) { 2574 6445 jjc uint32_t domain; 2575 6445 jjc boolean_t overflow; 2576 6445 jjc uint_t start; 2577 6445 jjc 2578 6445 jjc switch (item->type) { 2579 6445 jjc case SRAT_PROCESSOR: /* CPU entry */ 2580 9716 jonathan if (!(item->i.p.flags & SRAT_ENABLED)) { 2581 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2582 9716 jonathan item->len); 2583 9716 jonathan continue; 2584 9716 jonathan } 2585 6445 jjc domain = item->i.p.domain1; 2586 6445 jjc for (i = 0; i < 3; i++) { 2587 6445 jjc domain += item->i.p.domain2[i] << 2588 6445 jjc ((i + 1) * 8); 2589 6445 jjc } 2590 6445 jjc break; 2591 6445 jjc 2592 6445 jjc case SRAT_MEMORY: /* memory entry */ 2593 9716 jonathan if (!(item->i.m.flags & SRAT_ENABLED)) { 2594 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2595 9716 jonathan item->len); 2596 9716 jonathan continue; 2597 9716 jonathan } 2598 6445 jjc domain = item->i.m.domain; 2599 7282 mishra break; 2600 7282 mishra 2601 7282 mishra case SRAT_X2APIC: /* x2apic CPU entry */ 2602 9716 jonathan if (!(item->i.xp.flags & SRAT_ENABLED)) { 2603 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2604 9716 jonathan item->len); 2605 9716 jonathan continue; 2606 9716 jonathan } 2607 7282 mishra domain = item->i.xp.domain; 2608 6445 jjc break; 2609 6445 jjc 2610 6445 jjc default: 2611 9716 jonathan item = (struct srat_item *)((uintptr_t)item + 2612 9716 jonathan item->len); 2613 9716 jonathan continue; 2614 6445 jjc } 2615 6445 jjc 2616 6445 jjc /* 2617 6445 jjc * Count and keep track of which proximity domain IDs seen 2618 6445 jjc */ 2619 6445 jjc start = i = domain % MAX_NODES; 2620 6445 jjc overflow = B_TRUE; 2621 6445 jjc do { 2622 6445 jjc /* 2623 6445 jjc * Create entry for proximity 
domain and increment 2624 6445 jjc * count when no entry exists where proximity domain 2625 6445 jjc * hashed 2626 6445 jjc */ 2627 6445 jjc if (!node_domain[i].exists) { 2628 6445 jjc node_domain[i].exists = 1; 2629 6445 jjc node_domain[i].prox_domain = domain; 2630 6445 jjc domain_cnt++; 2631 6445 jjc overflow = B_FALSE; 2632 0 stevel break; 2633 0 stevel } 2634 6445 jjc 2635 6445 jjc /* 2636 6445 jjc * Nothing to do when proximity domain seen already 2637 6445 jjc * and its entry exists 2638 6445 jjc */ 2639 6445 jjc if (node_domain[i].prox_domain == domain) { 2640 6445 jjc overflow = B_FALSE; 2641 6445 jjc break; 2642 6445 jjc } 2643 6445 jjc 2644 6445 jjc /* 2645 6445 jjc * Entry exists where proximity domain hashed, but for 2646 6445 jjc * different proximity domain so keep search for empty 2647 6445 jjc * slot to put it or matching entry whichever comes 2648 6445 jjc * first. 2649 6445 jjc */ 2650 6445 jjc i = (i + 1) % MAX_NODES; 2651 6445 jjc } while (i != start); 2652 6445 jjc 2653 6445 jjc /* 2654 6445 jjc * Didn't find empty or matching entry which means have more 2655 6445 jjc * proximity domains than supported nodes (:-( 2656 6445 jjc */ 2657 6445 jjc ASSERT(overflow != B_TRUE); 2658 6445 jjc if (overflow == B_TRUE) 2659 6445 jjc return (-1); 2660 6445 jjc 2661 6445 jjc item = (struct srat_item *)((uintptr_t)item + item->len); 2662 6445 jjc } 2663 6445 jjc return (domain_cnt); 2664 6445 jjc } 2665 6445 jjc 2666 6445 jjc 2667 6445 jjc /* 2668 6445 jjc * Set lgroup latencies for 2 level lgroup topology 2669 6445 jjc */ 2670 6445 jjc static void 2671 6445 jjc lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, 2672 6445 jjc lgrp_plat_latency_stats_t *lat_stats) 2673 6445 jjc { 2674 6445 jjc int i; 2675 6445 jjc 2676 6445 jjc ASSERT(node_memory != NULL && lat_stats != NULL); 2677 6445 jjc 2678 6445 jjc if (lgrp_plat_node_cnt >= 4) 2679 6445 jjc cmn_err(CE_NOTE, 2680 6445 jjc "MPO only optimizing for local and remote\n"); 2681 6445 jjc for (i = 0; i 
< lgrp_plat_node_cnt; i++) { 2682 6445 jjc int j; 2683 6445 jjc 2684 6445 jjc if (!node_memory[i].exists) 2685 6445 jjc continue; 2686 6445 jjc for (j = 0; j < lgrp_plat_node_cnt; j++) { 2687 6445 jjc if (!node_memory[j].exists) 2688 6445 jjc continue; 2689 6445 jjc if (i == j) 2690 6445 jjc lat_stats->latencies[i][j] = 2; 2691 6445 jjc else 2692 6445 jjc lat_stats->latencies[i][j] = 3; 2693 0 stevel } 2694 0 stevel } 2695 6445 jjc lat_stats->latency_min = 2; 2696 6445 jjc lat_stats->latency_max = 3; 2697 6445 jjc lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); 2698 0 stevel } 2699 0 stevel 2700 6445 jjc 2701 0 stevel /* 2702 6445 jjc * The following Opteron specific constants, macros, types, and routines define 2703 6445 jjc * PCI configuration space registers and how to read them to determine the NUMA 2704 6445 jjc * configuration of *supported* Opteron processors. They provide the same 2705 6445 jjc * information that may be gotten from the ACPI System Resource Affinity Table 2706 6445 jjc * (SRAT) if it exists on the machine of interest. 2707 0 stevel * 2708 6445 jjc * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family 2709 6445 jjc * of interest describes all of these registers and their contents. The main 2710 6445 jjc * registers used by this code to determine the NUMA configuration of the 2711 6445 jjc * machine are the node ID register for the number of NUMA nodes and the DRAM 2712 6445 jjc * address map registers for the physical address range of each node. 2713 6445 jjc * 2714 6445 jjc * NOTE: The format and how to determine the NUMA configuration using PCI 2715 6445 jjc * config space registers may change or may not be supported in future 2716 6445 jjc * Opteron processor families. 
2717 0 stevel */ 2718 6445 jjc 2719 6445 jjc /* 2720 6445 jjc * How many bits to shift Opteron DRAM Address Map base and limit registers 2721 6445 jjc * to get actual value 2722 6445 jjc */ 2723 6445 jjc #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 2724 6445 jjc #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 2725 6445 jjc 2726 6445 jjc #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 2727 6445 jjc #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 2728 6445 jjc 2729 6445 jjc #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 2730 6445 jjc 2731 6445 jjc /* 2732 6445 jjc * Macros to derive addresses from Opteron DRAM Address Map registers 2733 6445 jjc */ 2734 6445 jjc #define OPT_DRAMADDR_HI(reg) \ 2735 6445 jjc (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 2736 6445 jjc OPT_DRAMADDR_HI_LSHIFT_ADDR) 2737 6445 jjc 2738 6445 jjc #define OPT_DRAMADDR_LO(reg) \ 2739 6445 jjc (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 2740 6445 jjc OPT_DRAMADDR_LO_LSHIFT_ADDR) 2741 6445 jjc 2742 6445 jjc #define OPT_DRAMADDR(high, low) \ 2743 6445 jjc (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 2744 6445 jjc 2745 6445 jjc /* 2746 6445 jjc * Bit masks defining what's in Opteron DRAM Address Map base register 2747 6445 jjc */ 2748 6445 jjc #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 2749 6445 jjc #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 2750 6445 jjc #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 2751 6445 jjc 2752 6445 jjc /* 2753 6445 jjc * Bit masks defining what's in Opteron DRAM Address Map limit register 2754 6445 jjc */ 2755 6445 jjc #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 2756 6445 jjc #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 2757 6445 jjc 2758 6445 jjc 2759 6445 jjc /* 2760 6445 jjc * Opteron Node ID register in PCI configuration space contains 2761 6445 jjc * number of 
nodes in system, etc. for Opteron K8. The following 2762 6445 jjc * constants and macros define its contents, structure, and access. 2763 6445 jjc */ 2764 6445 jjc 2765 6445 jjc /* 2766 6445 jjc * Bit masks defining what's in Opteron Node ID register 2767 6445 jjc */ 2768 6445 jjc #define OPT_NODE_MASK_ID 0x7 /* node ID */ 2769 6445 jjc #define OPT_NODE_MASK_CNT 0x70 /* node count */ 2770 6445 jjc #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 2771 6445 jjc #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 2772 6445 jjc #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 2773 6445 jjc 2774 6445 jjc /* 2775 6445 jjc * How many bits in Opteron Node ID register to shift right to get actual value 2776 6445 jjc */ 2777 6445 jjc #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 2778 6445 jjc 2779 6445 jjc /* 2780 6445 jjc * Macros to get values from Opteron Node ID register 2781 6445 jjc */ 2782 6445 jjc #define OPT_NODE_CNT(reg) \ 2783 6445 jjc ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 2784 6445 jjc 2785 6445 jjc /* 2786 6445 jjc * Macro to setup PCI Extended Configuration Space (ECS) address to give to 2787 6445 jjc * "in/out" instructions 2788 6445 jjc * 2789 6445 jjc * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 2790 6445 jjc * other uses should just do MMIO to access PCI ECS. 2791 6445 jjc * Must enable special bit in Northbridge Configuration Register on 2792 6445 jjc * Greyhound for extended CF8 space access to be able to access PCI ECS 2793 6445 jjc * using "in/out" instructions and restore special bit after done 2794 6445 jjc * accessing PCI ECS. 
2795 6445 jjc */ 2796 6445 jjc #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 2797 6445 jjc (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 2798 6445 jjc (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 2799 6445 jjc ((((reg) >> 8) & 0xf) << 24)) 2800 6445 jjc 2801 6445 jjc /* 2802 6445 jjc * PCI configuration space registers accessed by specifying 2803 6445 jjc * a bus, device, function, and offset. The following constants 2804 6445 jjc * define the values needed to access Opteron K8 configuration 2805 6445 jjc * info to determine its node topology 2806 6445 jjc */ 2807 6445 jjc 2808 6445 jjc #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 2809 6445 jjc 2810 6445 jjc /* 2811 6445 jjc * Opteron PCI configuration space register function values 2812 6445 jjc */ 2813 6445 jjc #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 2814 6445 jjc #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 2815 6445 jjc #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 2816 6445 jjc #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 2817 6445 jjc 2818 6445 jjc /* 2819 6445 jjc * PCI Configuration Space register offsets 2820 6445 jjc */ 2821 6445 jjc #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 2822 6445 jjc #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 2823 6445 jjc #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 2824 6445 jjc #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 2825 6445 jjc 2826 6445 jjc /* 2827 6445 jjc * Opteron PCI Configuration Space device IDs for nodes 2828 6445 jjc */ 2829 6445 jjc #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 2830 6445 jjc 2831 6445 jjc 2832 6445 jjc /* 2833 6445 jjc * Opteron DRAM address map gives base and limit for physical memory in a node 2834 6445 jjc */ 2835 6445 jjc typedef struct opt_dram_addr_map { 2836 6445 jjc uint32_t base_hi; 2837 6445 jjc uint32_t base_lo; 2838 6445 
jjc uint32_t limit_hi; 2839 6445 jjc uint32_t limit_lo; 2840 6445 jjc } opt_dram_addr_map_t; 2841 6445 jjc 2842 6445 jjc 2843 6445 jjc /* 2844 6445 jjc * Supported AMD processor families 2845 6445 jjc */ 2846 6445 jjc #define AMD_FAMILY_HAMMER 15 2847 6445 jjc #define AMD_FAMILY_GREYHOUND 16 2848 6445 jjc 2849 6445 jjc /* 2850 6445 jjc * Whether to have is_opteron() return 1 even when processor isn't supported 2851 6445 jjc */ 2852 6445 jjc uint_t is_opteron_override = 0; 2853 6445 jjc 2854 6445 jjc /* 2855 6445 jjc * AMD processor family for current CPU 2856 6445 jjc */ 2857 6445 jjc uint_t opt_family = 0; 2858 6445 jjc 2859 6445 jjc 2860 6445 jjc /* 2861 6445 jjc * Determine whether we're running on a supported AMD Opteron since reading 2862 6445 jjc * node count and DRAM address map registers may have different format or 2863 6445 jjc * may not be supported across processor families 2864 6445 jjc */ 2865 6445 jjc static int 2866 6445 jjc is_opteron(void) 2867 0 stevel { 2868 0 stevel 2869 6445 jjc if (x86_vendor != X86_VENDOR_AMD) 2870 0 stevel return (0); 2871 0 stevel 2872 6445 jjc opt_family = cpuid_getfamily(CPU); 2873 6445 jjc if (opt_family == AMD_FAMILY_HAMMER || 2874 6445 jjc opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 2875 6445 jjc return (1); 2876 6445 jjc else 2877 6445 jjc return (0); 2878 6445 jjc } 2879 0 stevel 2880 6445 jjc 2881 6445 jjc /* 2882 6445 jjc * Determine NUMA configuration for Opteron from registers that live in PCI 2883 6445 jjc * configuration space 2884 6445 jjc */ 2885 6445 jjc static void 2886 6445 jjc opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 2887 6445 jjc node_phys_addr_map_t *node_memory) 2888 6445 jjc { 2889 6445 jjc uint_t bus; 2890 6445 jjc uint_t dev; 2891 6445 jjc struct opt_dram_addr_map dram_map[MAX_NODES]; 2892 6445 jjc uint_t node; 2893 6445 jjc uint_t node_info[MAX_NODES]; 2894 6445 jjc uint_t off_hi; 2895 6445 jjc uint_t off_lo; 2896 6445 jjc uint64_t nb_cfg_reg; 2897 0 stevel 2898 0 
stevel /* 2899 6445 jjc * Read configuration registers from PCI configuration space to 2900 6445 jjc * determine node information, which memory is in each node, etc. 2901 6445 jjc * 2902 6445 jjc * Write to PCI configuration space address register to specify 2903 6445 jjc * which configuration register to read and read/write PCI 2904 6445 jjc * configuration space data register to get/set contents 2905 0 stevel */ 2906 6445 jjc bus = OPT_PCS_BUS_CONFIG; 2907 6445 jjc dev = OPT_PCS_DEV_NODE0; 2908 6445 jjc off_hi = OPT_PCS_OFF_DRAMBASE_HI; 2909 6445 jjc off_lo = OPT_PCS_OFF_DRAMBASE_LO; 2910 0 stevel 2911 0 stevel /* 2912 6445 jjc * Read node ID register for node 0 to get node count 2913 0 stevel */ 2914 6445 jjc node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 2915 6445 jjc OPT_PCS_OFF_NODEID); 2916 6445 jjc *node_cnt = OPT_NODE_CNT(node_info[0]) + 1; 2917 0 stevel 2918 6445 jjc /* 2919 6445 jjc * If number of nodes is more than maximum supported, then set node 2920 6445 jjc * count to 1 and treat system as UMA instead of NUMA. 
2921 6445 jjc */ 2922 6445 jjc if (*node_cnt > MAX_NODES) { 2923 6445 jjc *node_cnt = 1; 2924 6445 jjc return; 2925 6445 jjc } 2926 6445 jjc 2927 6445 jjc /* 2928 6445 jjc * For Greyhound, PCI Extended Configuration Space must be enabled to 2929 6445 jjc * read high DRAM address map base and limit registers 2930 6445 jjc */ 2931 6445 jjc if (opt_family == AMD_FAMILY_GREYHOUND) { 2932 6445 jjc nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 2933 6445 jjc if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2934 6445 jjc wrmsr(MSR_AMD_NB_CFG, 2935 6445 jjc nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 2936 6445 jjc } 2937 6445 jjc 2938 6445 jjc for (node = 0; node < *node_cnt; node++) { 2939 6445 jjc uint32_t base_hi; 2940 6445 jjc uint32_t base_lo; 2941 6445 jjc uint32_t limit_hi; 2942 6445 jjc uint32_t limit_lo; 2943 6445 jjc 2944 6445 jjc /* 2945 6445 jjc * Read node ID register (except for node 0 which we just read) 2946 6445 jjc */ 2947 6445 jjc if (node > 0) { 2948 6445 jjc node_info[node] = pci_getl_func(bus, dev, 2949 6445 jjc OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 2950 6445 jjc } 2951 6445 jjc 2952 6445 jjc /* 2953 6445 jjc * Read DRAM base and limit registers which specify 2954 6445 jjc * physical memory range of each node 2955 6445 jjc */ 2956 6445 jjc if (opt_family != AMD_FAMILY_GREYHOUND) 2957 6445 jjc base_hi = 0; 2958 6445 jjc else { 2959 6445 jjc outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2960 6445 jjc OPT_PCS_FUNC_ADDRMAP, off_hi)); 2961 6445 jjc base_hi = dram_map[node].base_hi = 2962 6445 jjc inl(PCI_CONFDATA); 2963 6445 jjc } 2964 6445 jjc base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev, 2965 6445 jjc OPT_PCS_FUNC_ADDRMAP, off_lo); 2966 6445 jjc 2967 6445 jjc if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) && 2968 6445 jjc mem_intrlv) 2969 6445 jjc *mem_intrlv = *mem_intrlv + 1; 2970 6445 jjc 2971 6445 jjc off_hi += 4; /* high limit register offset */ 2972 6445 jjc if (opt_family != AMD_FAMILY_GREYHOUND) 2973 6445 jjc limit_hi = 0; 2974 6445 jjc else 
{ 2975 6445 jjc outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2976 6445 jjc OPT_PCS_FUNC_ADDRMAP, off_hi)); 2977 6445 jjc limit_hi = dram_map[node].limit_hi = 2978 6445 jjc inl(PCI_CONFDATA); 2979 6445 jjc } 2980 6445 jjc 2981 6445 jjc off_lo += 4; /* low limit register offset */ 2982 6445 jjc limit_lo = dram_map[node].limit_lo = pci_getl_func(bus, 2983 6445 jjc dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 2984 6445 jjc 2985 6445 jjc /* 2986 6445 jjc * Increment device number to next node and register offsets 2987 6445 jjc * for DRAM base register of next node 2988 6445 jjc */ 2989 6445 jjc off_hi += 4; 2990 6445 jjc off_lo += 4; 2991 6445 jjc dev++; 2992 6445 jjc 2993 6445 jjc /* 2994 6445 jjc * Both read and write enable bits must be enabled in DRAM 2995 6445 jjc * address map base register for physical memory to exist in 2996 6445 jjc * node 2997 6445 jjc */ 2998 6445 jjc if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 2999 6445 jjc (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 3000 6445 jjc /* 3001 6445 jjc * Mark node memory as non-existent and set start and 3002 6445 jjc * end addresses to be same in node_memory[] 3003 6445 jjc */ 3004 6445 jjc node_memory[node].exists = 0; 3005 6445 jjc node_memory[node].start = node_memory[node].end = 3006 6445 jjc (pfn_t)-1; 3007 6445 jjc continue; 3008 6445 jjc } 3009 6445 jjc 3010 6445 jjc /* 3011 6445 jjc * Mark node memory as existing and remember physical address 3012 6445 jjc * range of each node for use later 3013 6445 jjc */ 3014 6445 jjc node_memory[node].exists = 1; 3015 6445 jjc 3016 6445 jjc node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); 3017 6445 jjc 3018 6445 jjc node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | 3019 6445 jjc OPT_DRAMADDR_LO_MASK_OFF); 3020 6445 jjc } 3021 6445 jjc 3022 6445 jjc /* 3023 6445 jjc * Restore PCI Extended Configuration Space enable bit 3024 6445 jjc */ 3025 6445 jjc if (opt_family == AMD_FAMILY_GREYHOUND) { 3026 6445 jjc if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) 
== 0) 3027 6445 jjc wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 3028 6445 jjc } 3029 0 stevel } 3030 0 stevel 3031 6445 jjc 3032 0 stevel /* 3033 6445 jjc * Return average amount of time to read vendor ID register on Northbridge 3034 6445 jjc * N times on specified destination node from current CPU 3035 0 stevel */ 3036 6445 jjc static hrtime_t 3037 6445 jjc opt_probe_vendor(int dest_node, int nreads) 3038 0 stevel { 3039 6445 jjc int cnt; 3040 6445 jjc uint_t dev; 3041 6445 jjc /* LINTED: set but not used in function */ 3042 6445 jjc volatile uint_t dev_vendor; 3043 6445 jjc hrtime_t elapsed; 3044 6445 jjc hrtime_t end; 3045 6445 jjc int ipl; 3046 6445 jjc hrtime_t start; 3047 6445 jjc 3048 6445 jjc dev = OPT_PCS_DEV_NODE0 + dest_node; 3049 6445 jjc kpreempt_disable(); 3050 6445 jjc ipl = spl8(); 3051 6445 jjc outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM, 3052 6445 jjc OPT_PCS_OFF_VENDOR)); 3053 6445 jjc start = gethrtime(); 3054 6445 jjc for (cnt = 0; cnt < nreads; cnt++) 3055 6445 jjc dev_vendor = inl(PCI_CONFDATA); 3056 6445 jjc end = gethrtime(); 3057 6445 jjc elapsed = (end - start) / nreads; 3058 6445 jjc splx(ipl); 3059 6445 jjc kpreempt_enable(); 3060 6445 jjc return (elapsed); 3061 0 stevel } 3062