Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Kernel statistics framework
     28  */
     29 
     30 #include <sys/types.h>
     31 #include <sys/time.h>
     32 #include <sys/systm.h>
     33 #include <sys/vmsystm.h>
     34 #include <sys/t_lock.h>
     35 #include <sys/param.h>
     36 #include <sys/errno.h>
     37 #include <sys/vmem.h>
     38 #include <sys/sysmacros.h>
     39 #include <sys/cmn_err.h>
     40 #include <sys/kstat.h>
     41 #include <sys/sysinfo.h>
     42 #include <sys/cpuvar.h>
     43 #include <sys/fcntl.h>
     44 #include <sys/flock.h>
     45 #include <sys/vnode.h>
     46 #include <sys/vfs.h>
     47 #include <sys/dnlc.h>
     48 #include <sys/var.h>
     49 #include <sys/vmmeter.h>
     50 #include <sys/debug.h>
     51 #include <sys/kobj.h>
     52 #include <sys/avl.h>
     53 #include <sys/pool_pset.h>
     54 #include <sys/cpupart.h>
     55 #include <sys/zone.h>
     56 #include <sys/loadavg.h>
     57 #include <vm/page.h>
     58 #include <vm/anon.h>
     59 #include <vm/seg_kmem.h>
     60 
/*
 * Global lock to protect the AVL trees and kstat_chain_id.  The per-kstat
 * zone lists and owner fields are also only examined or changed with this
 * lock held (see the kstat_zone_*() routines and kstat_hold()/kstat_rele()).
 */
static kmutex_t kstat_chain_lock;
     65 
     66 /*
     67  * Every install/delete kstat bumps kstat_chain_id.  This is used by:
     68  *
     69  * (1)	/dev/kstat, to detect changes in the kstat chain across ioctls;
     70  *
     71  * (2)	kstat_create(), to assign a KID (kstat ID) to each new kstat.
     72  *	/dev/kstat uses the KID as a cookie for kstat lookups.
     73  *
     74  * We reserve the first two IDs because some kstats are created before
     75  * the well-known ones (kstat_headers = 0, kstat_types = 1).
     76  *
     77  * We also bump the kstat_chain_id if a zone is gaining or losing visibility
     78  * into a particular kstat, which is logically equivalent to a kstat being
     79  * installed/deleted.
     80  */
     81 
kid_t kstat_chain_id = 2;	/* kstat_init() resets this to 0 */
     83 
     84 /*
     85  * As far as zones are concerned, there are 3 types of kstat:
     86  *
     87  * 1) Those which have a well-known name, and which should return per-zone data
     88  * depending on which zone is doing the kstat_read().  sockfs:0:sock_unix_list
     89  * is an example of this type of kstat.
     90  *
     91  * 2) Those which should only be exported to a particular list of zones.
     92  * For example, in the case of nfs:*:mntinfo, we don't want zone A to be
     93  * able to see NFS mounts associated with zone B, while we want the
     94  * global zone to be able to see all mounts on the system.
     95  *
     96  * 3) Those that can be exported to all zones.  Most system-related
     97  * kstats fall within this category.
     98  *
 * An ekstat_t thus contains a list of zones that the kstat is to be
 * exported to.  The lookup of a module:instance:name thus translates to a
 * lookup of module:instance:name:myzone; if the kstat is not exported
 * to all zones, and does not have the caller's zoneid explicitly
 * enumerated in the list of zones to be exported to, it is the same as
 * if the kstat didn't exist.
    105  *
    106  * Writing to kstats is currently disallowed from within a non-global
    107  * zone, although this restriction could be removed in the future.
    108  */
typedef struct kstat_zone {
	zoneid_t zoneid;		/* zone ID; ALL_ZONES matches any zone */
	struct kstat_zone *next;	/* next list entry; NULL ends the list */
} kstat_zone_t;
    113 
/*
 * Extended kstat structure -- for internal use only.
 *
 * e_ks must remain the first member: the code in this file converts
 * between kstat_t * and ekstat_t * with plain pointer casts.
 */
typedef struct ekstat {
	kstat_t		e_ks;		/* the kstat itself */
	size_t		e_size;		/* total allocation size */
	kthread_t	*e_owner;	/* thread holding this kstat */
	kcondvar_t	e_cv;		/* wait for owner == NULL */
	avl_node_t	e_avl_bykid;	/* AVL tree to sort by KID */
	avl_node_t	e_avl_byname;	/* AVL tree to sort by name */
	kstat_zone_t	e_zone;		/* head of list of zones to export to */
} ekstat_t;
    126 
/*
 * Early-boot allocation support.  While kstat_arena is still NULL,
 * kstat_alloc() carves kstats out of the static kstat_initial buffer by
 * advancing kstat_initial_ptr and shrinking kstat_initial_avail.
 * kstat_init() later passes the same buffer to vmem_create() when it
 * creates kstat_arena.
 */
static uint64_t kstat_initial[8192];
static void *kstat_initial_ptr = kstat_initial;
static size_t kstat_initial_avail = sizeof (kstat_initial);
static vmem_t *kstat_arena;

/* kstat_alloc() rounds every allocation size up to this boundary. */
#define	KSTAT_ALIGN	(sizeof (uint64_t))

/* Trees searched by kstat_hold_bykid() and kstat_hold_byname(). */
static avl_tree_t kstat_avl_bykid;
static avl_tree_t kstat_avl_byname;
    136 
    137 /*
    138  * Various pointers we need to create kstats at boot time in kstat_init()
    139  */
/* Data areas and entry counts for the segmap, biostats and pollstats kstats */
extern	kstat_named_t	*segmapcnt_ptr;
extern	uint_t		segmapcnt_ndata;
extern	int		segmap_kstat_update(kstat_t *, int);
extern	kstat_named_t	*biostats_ptr;
extern	uint_t		biostats_ndata;
extern	kstat_named_t	*pollstats_ptr;
extern	uint_t		pollstats_ndata;

/* Values reported through the system_misc, sysinfo and vminfo kstats */
extern	int	vac;
extern	uint_t	nproc;
extern	time_t	boot_time;
extern	sysinfo_t	sysinfo;
extern	vminfo_t	vminfo;
    153 
/*
 * Data area of the virtual named kstat unix:0:system_misc, which
 * kstat_init() creates with ks_data pointing here.  The values are filled
 * in by system_misc_kstat_update().  The initializers below must stay in
 * the same order as the members.
 */
struct {
	kstat_named_t ncpus;
	kstat_named_t lbolt;
	kstat_named_t deficit;
	kstat_named_t clk_intr;
	kstat_named_t vac;
	kstat_named_t nproc;
	kstat_named_t avenrun_1min;
	kstat_named_t avenrun_5min;
	kstat_named_t avenrun_15min;
	kstat_named_t boot_time;
} system_misc_kstat = {
	{ "ncpus",		KSTAT_DATA_UINT32 },
	{ "lbolt",		KSTAT_DATA_UINT32 },
	{ "deficit",		KSTAT_DATA_UINT32 },
	{ "clk_intr",		KSTAT_DATA_UINT32 },
	{ "vac",		KSTAT_DATA_UINT32 },
	{ "nproc",		KSTAT_DATA_UINT32 },
	{ "avenrun_1min",	KSTAT_DATA_UINT32 },
	{ "avenrun_5min",	KSTAT_DATA_UINT32 },
	{ "avenrun_15min",	KSTAT_DATA_UINT32 },
	{ "boot_time",		KSTAT_DATA_UINT32 },
};
    177 
/*
 * Data area of the virtual named kstat unix:0:system_pages, which
 * kstat_init() creates with ks_data pointing here.  The values are filled
 * in by system_pages_kstat_update().  The initializers below must stay in
 * the same order as the members.
 */
struct {
	kstat_named_t physmem;
	kstat_named_t nalloc;
	kstat_named_t nfree;
	kstat_named_t nalloc_calls;
	kstat_named_t nfree_calls;
	kstat_named_t kernelbase;
	kstat_named_t econtig;
	kstat_named_t freemem;
	kstat_named_t availrmem;
	kstat_named_t lotsfree;
	kstat_named_t desfree;
	kstat_named_t minfree;
	kstat_named_t fastscan;
	kstat_named_t slowscan;
	kstat_named_t nscan;
	kstat_named_t desscan;
	kstat_named_t pp_kernel;
	kstat_named_t pagesfree;
	kstat_named_t pageslocked;
	kstat_named_t pagestotal;
} system_pages_kstat = {
	{ "physmem",		KSTAT_DATA_ULONG },
	{ "nalloc",		KSTAT_DATA_ULONG },
	{ "nfree",		KSTAT_DATA_ULONG },
	{ "nalloc_calls",	KSTAT_DATA_ULONG },
	{ "nfree_calls",	KSTAT_DATA_ULONG },
	{ "kernelbase",		KSTAT_DATA_ULONG },
	{ "econtig", 		KSTAT_DATA_ULONG },
	{ "freemem", 		KSTAT_DATA_ULONG },
	{ "availrmem", 		KSTAT_DATA_ULONG },
	{ "lotsfree", 		KSTAT_DATA_ULONG },
	{ "desfree", 		KSTAT_DATA_ULONG },
	{ "minfree", 		KSTAT_DATA_ULONG },
	{ "fastscan", 		KSTAT_DATA_ULONG },
	{ "slowscan", 		KSTAT_DATA_ULONG },
	{ "nscan", 		KSTAT_DATA_ULONG },
	{ "desscan", 		KSTAT_DATA_ULONG },
	{ "pp_kernel", 		KSTAT_DATA_ULONG },
	{ "pagesfree", 		KSTAT_DATA_ULONG },
	{ "pageslocked", 	KSTAT_DATA_ULONG },
	{ "pagestotal",		KSTAT_DATA_ULONG },
};
    221 
/*
 * ks_update/ks_snapshot callbacks that kstat_init() attaches to the
 * kstat_headers, system_misc and system_pages kstats.
 */
static int header_kstat_update(kstat_t *, int);
static int header_kstat_snapshot(kstat_t *, void *, int);
static int system_misc_kstat_update(kstat_t *, int);
static int system_pages_kstat_update(kstat_t *, int);
    226 
/*
 * Table of kstat data types, indexed by type number.  kstat_init()
 * publishes each entry's name, with its index as the value, in the
 * unix:0:kstat_types kstat.  The size and min/max ndata fields are
 * presumably used to validate and size kstat_create() requests; that code
 * is not visible from here, so confirm before relying on it.
 */
static struct {
	char	name[KSTAT_STRLEN];
	size_t	size;
	uint_t	min_ndata;
	uint_t	max_ndata;
} kstat_data_type[KSTAT_NUM_TYPES] = {
	{ "raw",		1,			0,	INT_MAX	},
	{ "name=value",		sizeof (kstat_named_t),	0,	INT_MAX	},
	{ "interrupt",		sizeof (kstat_intr_t),	1,	1	},
	{ "i/o",		sizeof (kstat_io_t),	1,	1	},
	{ "event_timer",	sizeof (kstat_timer_t),	0,	INT_MAX	},
};
    239 
    240 int
    241 kstat_zone_find(kstat_t *k, zoneid_t zoneid)
    242 {
    243 	ekstat_t *e = (ekstat_t *)k;
    244 	kstat_zone_t *kz;
    245 
    246 	ASSERT(MUTEX_HELD(&kstat_chain_lock));
    247 	for (kz = &e->e_zone; kz != NULL; kz = kz->next) {
    248 		if (zoneid == ALL_ZONES || kz->zoneid == ALL_ZONES)
    249 			return (1);
    250 		if (zoneid == kz->zoneid)
    251 			return (1);
    252 	}
    253 	return (0);
    254 }
    255 
    256 void
    257 kstat_zone_remove(kstat_t *k, zoneid_t zoneid)
    258 {
    259 	ekstat_t *e = (ekstat_t *)k;
    260 	kstat_zone_t *kz, *t = NULL;
    261 
    262 	mutex_enter(&kstat_chain_lock);
    263 	if (zoneid == e->e_zone.zoneid) {
    264 		kz = e->e_zone.next;
    265 		ASSERT(kz != NULL);
    266 		e->e_zone.zoneid = kz->zoneid;
    267 		e->e_zone.next = kz->next;
    268 		goto out;
    269 	}
    270 	for (kz = &e->e_zone; kz->next != NULL; kz = kz->next) {
    271 		if (kz->next->zoneid == zoneid) {
    272 			t = kz->next;
    273 			kz->next = t->next;
    274 			break;
    275 		}
    276 	}
    277 	ASSERT(t != NULL);	/* we removed something */
    278 	kz = t;
    279 out:
    280 	kstat_chain_id++;
    281 	mutex_exit(&kstat_chain_lock);
    282 	kmem_free(kz, sizeof (*kz));
    283 }
    284 
    285 void
    286 kstat_zone_add(kstat_t *k, zoneid_t zoneid)
    287 {
    288 	ekstat_t *e = (ekstat_t *)k;
    289 	kstat_zone_t *kz;
    290 
    291 	kz = kmem_alloc(sizeof (*kz), KM_NOSLEEP);
    292 	if (kz == NULL)
    293 		return;
    294 	mutex_enter(&kstat_chain_lock);
    295 	kz->zoneid = zoneid;
    296 	kz->next = e->e_zone.next;
    297 	e->e_zone.next = kz;
    298 	kstat_chain_id++;
    299 	mutex_exit(&kstat_chain_lock);
    300 }
    301 
    302 /*
    303  * Compare the list of zones for the given kstats, returning 0 if they match
    304  * (ie, one list contains ALL_ZONES or both lists contain the same zoneid).
    305  * In practice, this is called indirectly by kstat_hold_byname(), so one of the
    306  * two lists always has one element, and this is an O(n) operation rather than
    307  * O(n^2).
    308  */
    309 static int
    310 kstat_zone_compare(ekstat_t *e1, ekstat_t *e2)
    311 {
    312 	kstat_zone_t *kz1, *kz2;
    313 
    314 	ASSERT(MUTEX_HELD(&kstat_chain_lock));
    315 	for (kz1 = &e1->e_zone; kz1 != NULL; kz1 = kz1->next) {
    316 		for (kz2 = &e2->e_zone; kz2 != NULL; kz2 = kz2->next) {
    317 			if (kz1->zoneid == ALL_ZONES ||
    318 			    kz2->zoneid == ALL_ZONES)
    319 				return (0);
    320 			if (kz1->zoneid == kz2->zoneid)
    321 				return (0);
    322 		}
    323 	}
    324 	return (e1->e_zone.zoneid < e2->e_zone.zoneid ? -1 : 1);
    325 }
    326 
    327 /*
    328  * Support for keeping kstats sorted in AVL trees for fast lookups.
    329  */
    330 static int
    331 kstat_compare_bykid(const void *a1, const void *a2)
    332 {
    333 	const kstat_t *k1 = a1;
    334 	const kstat_t *k2 = a2;
    335 
    336 	if (k1->ks_kid < k2->ks_kid)
    337 		return (-1);
    338 	if (k1->ks_kid > k2->ks_kid)
    339 		return (1);
    340 	return (kstat_zone_compare((ekstat_t *)k1, (ekstat_t *)k2));
    341 }
    342 
    343 static int
    344 kstat_compare_byname(const void *a1, const void *a2)
    345 {
    346 	const kstat_t *k1 = a1;
    347 	const kstat_t *k2 = a2;
    348 	int s;
    349 
    350 	s = strcmp(k1->ks_module, k2->ks_module);
    351 	if (s > 0)
    352 		return (1);
    353 	if (s < 0)
    354 		return (-1);
    355 
    356 	if (k1->ks_instance < k2->ks_instance)
    357 		return (-1);
    358 	if (k1->ks_instance > k2->ks_instance)
    359 		return (1);
    360 
    361 	s = strcmp(k1->ks_name, k2->ks_name);
    362 	if (s > 0)
    363 		return (1);
    364 	if (s < 0)
    365 		return (-1);
    366 
    367 	return (kstat_zone_compare((ekstat_t *)k1, (ekstat_t *)k2));
    368 }
    369 
    370 static kstat_t *
    371 kstat_hold(avl_tree_t *t, ekstat_t *template)
    372 {
    373 	kstat_t *ksp;
    374 	ekstat_t *e;
    375 
    376 	mutex_enter(&kstat_chain_lock);
    377 	for (;;) {
    378 		ksp = avl_find(t, template, NULL);
    379 		if (ksp == NULL)
    380 			break;
    381 		e = (ekstat_t *)ksp;
    382 		if (e->e_owner == NULL) {
    383 			e->e_owner = curthread;
    384 			break;
    385 		}
    386 		cv_wait(&e->e_cv, &kstat_chain_lock);
    387 	}
    388 	mutex_exit(&kstat_chain_lock);
    389 	return (ksp);
    390 }
    391 
    392 void
    393 kstat_rele(kstat_t *ksp)
    394 {
    395 	ekstat_t *e = (ekstat_t *)ksp;
    396 
    397 	mutex_enter(&kstat_chain_lock);
    398 	ASSERT(e->e_owner == curthread);
    399 	e->e_owner = NULL;
    400 	cv_broadcast(&e->e_cv);
    401 	mutex_exit(&kstat_chain_lock);
    402 }
    403 
    404 kstat_t *
    405 kstat_hold_bykid(kid_t kid, zoneid_t zoneid)
    406 {
    407 	ekstat_t e;
    408 
    409 	e.e_ks.ks_kid = kid;
    410 	e.e_zone.zoneid = zoneid;
    411 	e.e_zone.next = NULL;
    412 
    413 	return (kstat_hold(&kstat_avl_bykid, &e));
    414 }
    415 
    416 kstat_t *
    417 kstat_hold_byname(const char *ks_module, int ks_instance, const char *ks_name,
    418     zoneid_t ks_zoneid)
    419 {
    420 	ekstat_t e;
    421 
    422 	kstat_set_string(e.e_ks.ks_module, ks_module);
    423 	e.e_ks.ks_instance = ks_instance;
    424 	kstat_set_string(e.e_ks.ks_name, ks_name);
    425 	e.e_zone.zoneid = ks_zoneid;
    426 	e.e_zone.next = NULL;
    427 	return (kstat_hold(&kstat_avl_byname, &e));
    428 }
    429 
    430 static ekstat_t *
    431 kstat_alloc(size_t size)
    432 {
    433 	ekstat_t *e = NULL;
    434 
    435 	size = P2ROUNDUP(sizeof (ekstat_t) + size, KSTAT_ALIGN);
    436 
    437 	if (kstat_arena == NULL) {
    438 		if (size <= kstat_initial_avail) {
    439 			e = kstat_initial_ptr;
    440 			kstat_initial_ptr = (char *)kstat_initial_ptr + size;
    441 			kstat_initial_avail -= size;
    442 		}
    443 	} else {
    444 		e = vmem_alloc(kstat_arena, size, VM_NOSLEEP);
    445 	}
    446 
    447 	if (e != NULL) {
    448 		bzero(e, size);
    449 		e->e_size = size;
    450 		cv_init(&e->e_cv, NULL, CV_DEFAULT, NULL);
    451 	}
    452 
    453 	return (e);
    454 }
    455 
    456 static void
    457 kstat_free(ekstat_t *e)
    458 {
    459 	cv_destroy(&e->e_cv);
    460 	vmem_free(kstat_arena, e, e->e_size);
    461 }
    462 
    463 /*
    464  * Create various system kstats.
    465  */
    466 void
    467 kstat_init(void)
    468 {
    469 	kstat_t *ksp;
    470 	ekstat_t *e;
    471 	avl_tree_t *t = &kstat_avl_bykid;
    472 
    473 	/*
    474 	 * Set up the kstat vmem arena.
    475 	 */
    476 	kstat_arena = vmem_create("kstat",
    477 	    kstat_initial, sizeof (kstat_initial), KSTAT_ALIGN,
    478 	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
    479 
    480 	/*
    481 	 * Make initial kstats appear as though they were allocated.
    482 	 */
    483 	for (e = avl_first(t); e != NULL; e = avl_walk(t, e, AVL_AFTER))
    484 		(void) vmem_xalloc(kstat_arena, e->e_size, KSTAT_ALIGN,
    485 		    0, 0, e, (char *)e + e->e_size,
    486 		    VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
    487 
    488 	/*
    489 	 * The mother of all kstats.  The first kstat in the system, which
    490 	 * always has KID 0, has the headers for all kstats (including itself)
    491 	 * as its data.  Thus, the kstat driver does not need any special
    492 	 * interface to extract the kstat chain.
    493 	 */
    494 	kstat_chain_id = 0;
    495 	ksp = kstat_create("unix", 0, "kstat_headers", "kstat", KSTAT_TYPE_RAW,
    496 	    0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE);
    497 	if (ksp) {
    498 		ksp->ks_lock = &kstat_chain_lock;
    499 		ksp->ks_update = header_kstat_update;
    500 		ksp->ks_snapshot = header_kstat_snapshot;
    501 		kstat_install(ksp);
    502 	} else {
    503 		panic("cannot create kstat 'kstat_headers'");
    504 	}
    505 
    506 	ksp = kstat_create("unix", 0, "kstat_types", "kstat",
    507 	    KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0);
    508 	if (ksp) {
    509 		int i;
    510 		kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
    511 
    512 		for (i = 0; i < KSTAT_NUM_TYPES; i++) {
    513 			kstat_named_init(&kn[i], kstat_data_type[i].name,
    514 			    KSTAT_DATA_ULONG);
    515 			kn[i].value.ul = i;
    516 		}
    517 		kstat_install(ksp);
    518 	}
    519 
    520 	ksp = kstat_create("unix", 0, "sysinfo", "misc", KSTAT_TYPE_RAW,
    521 	    sizeof (sysinfo_t), KSTAT_FLAG_VIRTUAL);
    522 	if (ksp) {
    523 		ksp->ks_data = (void *) &sysinfo;
    524 		kstat_install(ksp);
    525 	}
    526 
    527 	ksp = kstat_create("unix", 0, "vminfo", "vm", KSTAT_TYPE_RAW,
    528 	    sizeof (vminfo_t), KSTAT_FLAG_VIRTUAL);
    529 	if (ksp) {
    530 		ksp->ks_data = (void *) &vminfo;
    531 		kstat_install(ksp);
    532 	}
    533 
    534 	ksp = kstat_create("unix", 0, "segmap", "vm", KSTAT_TYPE_NAMED,
    535 	    segmapcnt_ndata, KSTAT_FLAG_VIRTUAL);
    536 	if (ksp) {
    537 		ksp->ks_data = (void *) segmapcnt_ptr;
    538 		ksp->ks_update = segmap_kstat_update;
    539 		kstat_install(ksp);
    540 	}
    541 
    542 	ksp = kstat_create("unix", 0, "biostats", "misc", KSTAT_TYPE_NAMED,
    543 	    biostats_ndata, KSTAT_FLAG_VIRTUAL);
    544 	if (ksp) {
    545 		ksp->ks_data = (void *) biostats_ptr;
    546 		kstat_install(ksp);
    547 	}
    548 
    549 #ifdef VAC
    550 	ksp = kstat_create("unix", 0, "flushmeter", "hat", KSTAT_TYPE_RAW,
    551 	    sizeof (struct flushmeter), KSTAT_FLAG_VIRTUAL);
    552 	if (ksp) {
    553 		ksp->ks_data = (void *) &flush_cnt;
    554 		kstat_install(ksp);
    555 	}
    556 #endif	/* VAC */
    557 
    558 	ksp = kstat_create("unix", 0, "var", "misc", KSTAT_TYPE_RAW,
    559 	    sizeof (struct var), KSTAT_FLAG_VIRTUAL);
    560 	if (ksp) {
    561 		ksp->ks_data = (void *) &v;
    562 		kstat_install(ksp);
    563 	}
    564 
    565 	ksp = kstat_create("unix", 0, "system_misc", "misc", KSTAT_TYPE_NAMED,
    566 	    sizeof (system_misc_kstat) / sizeof (kstat_named_t),
    567 	    KSTAT_FLAG_VIRTUAL);
    568 	if (ksp) {
    569 		ksp->ks_data = (void *) &system_misc_kstat;
    570 		ksp->ks_update = system_misc_kstat_update;
    571 		kstat_install(ksp);
    572 	}
    573 
    574 	ksp = kstat_create("unix", 0, "system_pages", "pages", KSTAT_TYPE_NAMED,
    575 	    sizeof (system_pages_kstat) / sizeof (kstat_named_t),
    576 	    KSTAT_FLAG_VIRTUAL);
    577 	if (ksp) {
    578 		ksp->ks_data = (void *) &system_pages_kstat;
    579 		ksp->ks_update = system_pages_kstat_update;
    580 		kstat_install(ksp);
    581 	}
    582 
    583 	ksp = kstat_create("poll", 0, "pollstats", "misc", KSTAT_TYPE_NAMED,
    584 	    pollstats_ndata, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
    585 
    586 	if (ksp) {
    587 		ksp->ks_data = pollstats_ptr;
    588 		kstat_install(ksp);
    589 	}
    590 }
    591 
    592 /*
    593  * Caller of this should ensure that the string pointed by src
    594  * doesn't change while kstat's lock is held. Not doing so defeats
    595  * kstat's snapshot strategy as explained in <sys/kstat.h>
    596  */
    597 void
    598 kstat_named_setstr(kstat_named_t *knp, const char *src)
    599 {
    600 	if (knp->data_type != KSTAT_DATA_STRING)
    601 		panic("kstat_named_setstr('%p', '%p'): "
    602 		    "named kstat is not of type KSTAT_DATA_STRING",
    603 		    (void *)knp, (void *)src);
    604 
    605 	KSTAT_NAMED_STR_PTR(knp) = (char *)src;
    606 	if (src != NULL)
    607 		KSTAT_NAMED_STR_BUFLEN(knp) = strlen(src) + 1;
    608 	else
    609 		KSTAT_NAMED_STR_BUFLEN(knp) = 0;
    610 }
    611 
    612 void
    613 kstat_set_string(char *dst, const char *src)
    614 {
    615 	bzero(dst, KSTAT_STRLEN);
    616 	(void) strncpy(dst, src, KSTAT_STRLEN - 1);
    617 }
    618 
    619 void
    620 kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type)
    621 {
    622 	kstat_set_string(knp->name, name);
    623 	knp->data_type = data_type;
    624 
    625 	if (data_type == KSTAT_DATA_STRING)
    626 		kstat_named_setstr(knp, NULL);
    627 }
    628 
/*
 * Set the name of a timer kstat entry.  The name is copied with
 * kstat_set_string(), so at most KSTAT_STRLEN - 1 characters are kept.
 */
void
kstat_timer_init(kstat_timer_t *ktp, const char *name)
{
	kstat_set_string(ktp->name, name);
}
    634 
    635 /* ARGSUSED */
    636 static int
    637 default_kstat_update(kstat_t *ksp, int rw)
    638 {
    639 	uint_t i;
    640 	size_t len = 0;
    641 	kstat_named_t *knp;
    642 
    643 	/*
    644 	 * Named kstats with variable-length long strings have a standard
    645 	 * way of determining how much space is needed to hold the snapshot:
    646 	 */
    647 	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED &&
    648 	    (ksp->ks_flags & KSTAT_FLAG_VAR_SIZE)) {
    649 
    650 		/*
    651 		 * Add in the space required for the strings
    652 		 */
    653 		knp = KSTAT_NAMED_PTR(ksp);
    654 		for (i = 0; i < ksp->ks_ndata; i++, knp++) {
    655 			if (knp->data_type == KSTAT_DATA_STRING)
    656 				len += KSTAT_NAMED_STR_BUFLEN(knp);
    657 		}
    658 		ksp->ks_data_size =
    659 		    ksp->ks_ndata * sizeof (kstat_named_t) + len;
    660 	}
    661 	return (0);
    662 }
    663 
    664 static int
    665 default_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
    666 {
    667 	kstat_io_t *kiop;
    668 	hrtime_t cur_time;
    669 	size_t	namedsz;
    670 
    671 	ksp->ks_snaptime = cur_time = gethrtime();
    672 
    673 	if (rw == KSTAT_WRITE) {
    674 		if (!(ksp->ks_flags & KSTAT_FLAG_WRITABLE))
    675 			return (EACCES);
    676 		bcopy(buf, ksp->ks_data, ksp->ks_data_size);
    677 		return (0);
    678 	}
    679 
    680 	/*
    681 	 * KSTAT_TYPE_NAMED kstats are defined to have ks_ndata
    682 	 * number of kstat_named_t structures, followed by an optional
    683 	 * string segment. The ks_data generally holds only the
    684 	 * kstat_named_t structures. So we copy it first. The strings,
    685 	 * if any, are copied below. For other kstat types, ks_data holds the
    686 	 * entire buffer.
    687 	 */
    688 
    689 	namedsz = sizeof (kstat_named_t) * ksp->ks_ndata;
    690 	if (ksp->ks_type == KSTAT_TYPE_NAMED && ksp->ks_data_size > namedsz)
    691 		bcopy(ksp->ks_data, buf, namedsz);
    692 	else
    693 		bcopy(ksp->ks_data, buf, ksp->ks_data_size);
    694 
    695 	/*
    696 	 * Apply kstat type-specific data massaging
    697 	 */
    698 	switch (ksp->ks_type) {
    699 
    700 	case KSTAT_TYPE_IO:
    701 		/*
    702 		 * Normalize time units and deal with incomplete transactions
    703 		 */
    704 		kiop = (kstat_io_t *)buf;
    705 
    706 		scalehrtime(&kiop->wtime);
    707 		scalehrtime(&kiop->wlentime);
    708 		scalehrtime(&kiop->wlastupdate);
    709 		scalehrtime(&kiop->rtime);
    710 		scalehrtime(&kiop->rlentime);
    711 		scalehrtime(&kiop->rlastupdate);
    712 
    713 		if (kiop->wcnt != 0) {
    714 			/* like kstat_waitq_exit */
    715 			hrtime_t wfix = cur_time - kiop->wlastupdate;
    716 			kiop->wlastupdate = cur_time;
    717 			kiop->wlentime += kiop->wcnt * wfix;
    718 			kiop->wtime += wfix;
    719 		}
    720 
    721 		if (kiop->rcnt != 0) {
    722 			/* like kstat_runq_exit */
    723 			hrtime_t rfix = cur_time - kiop->rlastupdate;
    724 			kiop->rlastupdate = cur_time;
    725 			kiop->rlentime += kiop->rcnt * rfix;
    726 			kiop->rtime += rfix;
    727 		}
    728 		break;
    729 
    730 	case KSTAT_TYPE_NAMED:
    731 		/*
    732 		 * Massage any long strings in at the end of the buffer
    733 		 */
    734 		if (ksp->ks_data_size > namedsz) {
    735 			uint_t i;
    736 			kstat_named_t *knp = buf;
    737 			char *dst = (char *)(knp + ksp->ks_ndata);
    738 			/*
    739 			 * Copy strings and update pointers
    740 			 */
    741 			for (i = 0; i < ksp->ks_ndata; i++, knp++) {
    742 				if (knp->data_type == KSTAT_DATA_STRING &&
    743 				    KSTAT_NAMED_STR_PTR(knp) != NULL) {
    744 					bcopy(KSTAT_NAMED_STR_PTR(knp), dst,
    745 					    KSTAT_NAMED_STR_BUFLEN(knp));
    746 					KSTAT_NAMED_STR_PTR(knp) = dst;
    747 					dst += KSTAT_NAMED_STR_BUFLEN(knp);
    748 				}
    749 			}
    750 			ASSERT(dst <= ((char *)buf + ksp->ks_data_size));
    751 		}
    752 		break;
    753 	}
    754 	return (0);
    755 }
    756 
    757 static int
    758 header_kstat_update(kstat_t *header_ksp, int rw)
    759 {
    760 	int nkstats = 0;
    761 	ekstat_t *e;
    762 	avl_tree_t *t = &kstat_avl_bykid;
    763 	zoneid_t zoneid;
    764 
    765 	if (rw == KSTAT_WRITE)
    766 		return (EACCES);
    767 
    768 	ASSERT(MUTEX_HELD(&kstat_chain_lock));
    769 
    770 	zoneid = getzoneid();
    771 	for (e = avl_first(t); e != NULL; e = avl_walk(t, e, AVL_AFTER)) {
    772 		if (kstat_zone_find((kstat_t *)e, zoneid)) {
    773 			nkstats++;
    774 		}
    775 	}
    776 	header_ksp->ks_ndata = nkstats;
    777 	header_ksp->ks_data_size = nkstats * sizeof (kstat_t);
    778 	return (0);
    779 }
    780 
    781 /*
    782  * Copy out the data section of kstat 0, which consists of the list
    783  * of all kstat headers.  By specification, these headers must be
    784  * copied out in order of increasing KID.
    785  */
    786 static int
    787 header_kstat_snapshot(kstat_t *header_ksp, void *buf, int rw)
    788 {
    789 	ekstat_t *e;
    790 	avl_tree_t *t = &kstat_avl_bykid;
    791 	zoneid_t zoneid;
    792 
    793 	header_ksp->ks_snaptime = gethrtime();
    794 
    795 	if (rw == KSTAT_WRITE)
    796 		return (EACCES);
    797 
    798 	ASSERT(MUTEX_HELD(&kstat_chain_lock));
    799 
    800 	zoneid = getzoneid();
    801 	for (e = avl_first(t); e != NULL; e = avl_walk(t, e, AVL_AFTER)) {
    802 		if (kstat_zone_find((kstat_t *)e, zoneid)) {
    803 			bcopy(&e->e_ks, buf, sizeof (kstat_t));
    804 			buf = (char *)buf + sizeof (kstat_t);
    805 		}
    806 	}
    807 
    808 	return (0);
    809 }
    810 
    811 /* ARGSUSED */
    812 static int
    813 system_misc_kstat_update(kstat_t *ksp, int rw)
    814 {
    815 	int myncpus = ncpus;
    816 	int *loadavgp = &avenrun[0];
    817 	int loadavg[LOADAVG_NSTATS];
    818 	time_t zone_boot_time;
    819 	clock_t zone_lbolt;
    820 	hrtime_t zone_hrtime;
    821 
    822 	if (rw == KSTAT_WRITE)
    823 		return (EACCES);
    824 
    825 	if (!INGLOBALZONE(curproc)) {
    826 		/*
    827 		 * Here we grab cpu_lock which is OK as long as no-one in the
    828 		 * future attempts to lookup this particular kstat
    829 		 * (unix:0:system_misc) while holding cpu_lock.
    830 		 */
    831 		mutex_enter(&cpu_lock);
    832 		if (pool_pset_enabled()) {
    833 			psetid_t mypsid = zone_pset_get(curproc->p_zone);
    834 			int error;
    835 
    836 			myncpus = zone_ncpus_get(curproc->p_zone);
    837 			ASSERT(myncpus > 0);
    838 			error = cpupart_get_loadavg(mypsid, &loadavg[0],
    839 			    LOADAVG_NSTATS);
    840 			ASSERT(error == 0);
    841 			loadavgp = &loadavg[0];
    842 		}
    843 		mutex_exit(&cpu_lock);
    844 	}
    845 
    846 	if (curproc->p_zone->zone_id == 0) {
    847 		zone_boot_time = boot_time;
    848 		zone_lbolt = ddi_get_lbolt();
    849 	} else {
    850 		struct timeval tvp;
    851 		hrt2tv(curproc->p_zone->zone_zsched->p_mstart, &tvp);
    852 		zone_boot_time = tvp.tv_sec;
    853 
    854 		zone_hrtime = gethrtime();
    855 		zone_lbolt = (clock_t)(NSEC_TO_TICK(zone_hrtime) -
    856 		    NSEC_TO_TICK(curproc->p_zone->zone_zsched->p_mstart));
    857 	}
    858 
    859 	system_misc_kstat.ncpus.value.ui32		= (uint32_t)myncpus;
    860 	system_misc_kstat.lbolt.value.ui32		= (uint32_t)zone_lbolt;
    861 	system_misc_kstat.deficit.value.ui32		= (uint32_t)deficit;
    862 	system_misc_kstat.clk_intr.value.ui32		= (uint32_t)zone_lbolt;
    863 	system_misc_kstat.vac.value.ui32		= (uint32_t)vac;
    864 	system_misc_kstat.nproc.value.ui32		= (uint32_t)nproc;
    865 	system_misc_kstat.avenrun_1min.value.ui32	= (uint32_t)loadavgp[0];
    866 	system_misc_kstat.avenrun_5min.value.ui32	= (uint32_t)loadavgp[1];
    867 	system_misc_kstat.avenrun_15min.value.ui32	= (uint32_t)loadavgp[2];
    868 	system_misc_kstat.boot_time.value.ui32		= (uint32_t)
    869 	    zone_boot_time;
    870 	return (0);
    871 }
    872 
/*
 * Address reported as the "econtig" value of unix:0:system_pages; on
 * sparc, econtig32 is reported instead.
 */
#ifdef	__sparc
extern caddr_t	econtig32;
#else	/* !__sparc */
extern caddr_t	econtig;
#endif	/* __sparc */
    878 
/*
 * Update routine for unix:0:system_pages: refresh every value in
 * system_pages_kstat from the kernel object loader statistics and the
 * current VM globals.  Writes are rejected with EACCES.
 *
 * The globals are read without any locking visible here, one after
 * another, so the values are individually current rather than a single
 * consistent snapshot.
 */
/* ARGSUSED */
static int
system_pages_kstat_update(kstat_t *ksp, int rw)
{
	kobj_stat_t kobj_stat;

	if (rw == KSTAT_WRITE) {
		return (EACCES);
	}

	/* nalloc/nfree and their call counts come from kobj_stat_get() */
	kobj_stat_get(&kobj_stat);
	system_pages_kstat.physmem.value.ul	= (ulong_t)physmem;
	system_pages_kstat.nalloc.value.ul	= kobj_stat.nalloc;
	system_pages_kstat.nfree.value.ul	= kobj_stat.nfree;
	system_pages_kstat.nalloc_calls.value.ul = kobj_stat.nalloc_calls;
	system_pages_kstat.nfree_calls.value.ul	= kobj_stat.nfree_calls;
	system_pages_kstat.kernelbase.value.ul	= (ulong_t)KERNELBASE;

#ifdef	__sparc
	/*
	 * kstat should REALLY be modified to also report kmem64_base and
	 * kmem64_end (see sun4u/os/startup.c), as the virtual address range
	 * [ kernelbase .. econtig ] no longer is truly reflective of the
	 * kernel's vallocs...
	 */
	system_pages_kstat.econtig.value.ul	= (ulong_t)econtig32;
#else	/* !__sparc */
	system_pages_kstat.econtig.value.ul	= (ulong_t)econtig;
#endif	/* __sparc */

	system_pages_kstat.freemem.value.ul	= (ulong_t)freemem;
	system_pages_kstat.availrmem.value.ul	= (ulong_t)availrmem;
	system_pages_kstat.lotsfree.value.ul	= (ulong_t)lotsfree;
	system_pages_kstat.desfree.value.ul	= (ulong_t)desfree;
	system_pages_kstat.minfree.value.ul	= (ulong_t)minfree;
	system_pages_kstat.fastscan.value.ul	= (ulong_t)fastscan;
	system_pages_kstat.slowscan.value.ul	= (ulong_t)slowscan;
	system_pages_kstat.nscan.value.ul	= (ulong_t)nscan;
	system_pages_kstat.desscan.value.ul	= (ulong_t)desscan;
	/* pagesfree is reported from the same global as freemem */
	system_pages_kstat.pagesfree.value.ul	= (ulong_t)freemem;
	/* pageslocked is how far availrmem has dropped from its initial value */
	system_pages_kstat.pageslocked.value.ul	= (ulong_t)(availrmem_initial -
	    availrmem);
	system_pages_kstat.pagestotal.value.ul	= (ulong_t)total_pages;
	/*
	 * pp_kernel represents total pages used by the kernel since the
	 * startup. This formula takes into account the boottime kernel
	 * footprint and also considers the availrmem changes because of
	 * user explicit page locking.
	 */
	system_pages_kstat.pp_kernel.value.ul   = (ulong_t)(physinstalled -
	    obp_pages - availrmem - k_anoninfo.ani_mem_resv -
	    anon_segkp_pages_locked - pages_locked -
	    pages_claimed - pages_useclaim);

	return (0);
}
    935 
    936 kstat_t *
    937 kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
    938     const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags)
    939 {
    940 	return (kstat_create_zone(ks_module, ks_instance, ks_name, ks_class,
    941 	    ks_type, ks_ndata, ks_flags, ALL_ZONES));
    942 }
    943 
    944 /*
    945  * Allocate and initialize a kstat structure.  Or, if a dormant kstat with
    946  * the specified name exists, reactivate it.  Returns a pointer to the kstat
    947  * on success, NULL on failure.  The kstat will not be visible to the
    948  * kstat driver until kstat_install().
    949  */
    950 kstat_t *
    951 kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name,
    952     const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
    953     zoneid_t ks_zoneid)
    954 {
    955 	size_t ks_data_size;
    956 	kstat_t *ksp;
    957 	ekstat_t *e;
    958 	avl_index_t where;
    959 	char namebuf[KSTAT_STRLEN + 16];
    960 
    961 	if (avl_numnodes(&kstat_avl_bykid) == 0) {
    962 		avl_create(&kstat_avl_bykid, kstat_compare_bykid,
    963 		    sizeof (ekstat_t), offsetof(struct ekstat, e_avl_bykid));
    964 
    965 		avl_create(&kstat_avl_byname, kstat_compare_byname,
    966 		    sizeof (ekstat_t), offsetof(struct ekstat, e_avl_byname));
    967 	}
    968 
    969 	/*
    970 	 * If ks_name == NULL, set the ks_name to <module><instance>.
    971 	 */
    972 	if (ks_name == NULL) {
    973 		char buf[KSTAT_STRLEN];
    974 		kstat_set_string(buf, ks_module);
    975 		(void) sprintf(namebuf, "%s%d", buf, ks_instance);
    976 		ks_name = namebuf;
    977 	}
    978 
    979 	/*
    980 	 * Make sure it's a valid kstat data type
    981 	 */
    982 	if (ks_type >= KSTAT_NUM_TYPES) {
    983 		cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
    984 		    "invalid kstat type %d",
    985 		    ks_module, ks_instance, ks_name, ks_type);
    986 		return (NULL);
    987 	}
    988 
    989 	/*
    990 	 * Don't allow persistent virtual kstats -- it makes no sense.
    991 	 * ks_data points to garbage when the client goes away.
    992 	 */
    993 	if ((ks_flags & KSTAT_FLAG_PERSISTENT) &&
    994 	    (ks_flags & KSTAT_FLAG_VIRTUAL)) {
    995 		cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
    996 		    "cannot create persistent virtual kstat",
    997 		    ks_module, ks_instance, ks_name);
    998 		return (NULL);
    999 	}
   1000 
   1001 	/*
   1002 	 * Don't allow variable-size physical kstats, since the framework's
   1003 	 * memory allocation for physical kstat data is fixed at creation time.
   1004 	 */
   1005 	if ((ks_flags & KSTAT_FLAG_VAR_SIZE) &&
   1006 	    !(ks_flags & KSTAT_FLAG_VIRTUAL)) {
   1007 		cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
   1008 		    "cannot create variable-size physical kstat",
   1009 		    ks_module, ks_instance, ks_name);
   1010 		return (NULL);
   1011 	}
   1012 
   1013 	/*
   1014 	 * Make sure the number of data fields is within legal range
   1015 	 */
   1016 	if (ks_ndata < kstat_data_type[ks_type].min_ndata ||
   1017 	    ks_ndata > kstat_data_type[ks_type].max_ndata) {
   1018 		cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
   1019 		    "ks_ndata=%d out of range [%d, %d]",
   1020 		    ks_module, ks_instance, ks_name, (int)ks_ndata,
   1021 		    kstat_data_type[ks_type].min_ndata,
   1022 		    kstat_data_type[ks_type].max_ndata);
   1023 		return (NULL);
   1024 	}
   1025 
   1026 	ks_data_size = kstat_data_type[ks_type].size * ks_ndata;
   1027 
   1028 	/*
   1029 	 * If the named kstat already exists and is dormant, reactivate it.
   1030 	 */
   1031 	ksp = kstat_hold_byname(ks_module, ks_instance, ks_name, ks_zoneid);
   1032 	if (ksp != NULL) {
   1033 		if (!(ksp->ks_flags & KSTAT_FLAG_DORMANT)) {
   1034 			/*
   1035 			 * The named kstat exists but is not dormant --
   1036 			 * this is a kstat namespace collision.
   1037 			 */
   1038 			kstat_rele(ksp);
   1039 			cmn_err(CE_WARN,
   1040 			    "kstat_create('%s', %d, '%s'): namespace collision",
   1041 			    ks_module, ks_instance, ks_name);
   1042 			return (NULL);
   1043 		}
   1044 		if ((strcmp(ksp->ks_class, ks_class) != 0) ||
   1045 		    (ksp->ks_type != ks_type) ||
   1046 		    (ksp->ks_ndata != ks_ndata) ||
   1047 		    (ks_flags & KSTAT_FLAG_VIRTUAL)) {
   1048 			/*
   1049 			 * The name is the same, but the other key parameters
   1050 			 * differ from those of the dormant kstat -- bogus.
   1051 			 */
   1052 			kstat_rele(ksp);
   1053 			cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): "
   1054 			    "invalid reactivation of dormant kstat",
   1055 			    ks_module, ks_instance, ks_name);
   1056 			return (NULL);
   1057 		}
   1058 		/*
   1059 		 * Return dormant kstat pointer to caller.  As usual,
   1060 		 * the kstat is marked invalid until kstat_install().
   1061 		 */
   1062 		ksp->ks_flags |= KSTAT_FLAG_INVALID;
   1063 		kstat_rele(ksp);
   1064 		return (ksp);
   1065 	}
   1066 
   1067 	/*
   1068 	 * Allocate memory for the new kstat header and, if this is a physical
   1069 	 * kstat, the data section.
   1070 	 */
   1071 	e = kstat_alloc(ks_flags & KSTAT_FLAG_VIRTUAL ? 0 : ks_data_size);
   1072 	if (e == NULL) {
   1073 		cmn_err(CE_NOTE, "kstat_create('%s', %d, '%s'): "
   1074 		    "insufficient kernel memory",
   1075 		    ks_module, ks_instance, ks_name);
   1076 		return (NULL);
   1077 	}
   1078 
   1079 	/*
   1080 	 * Initialize as many fields as we can.  The caller may reset
   1081 	 * ks_lock, ks_update, ks_private, and ks_snapshot as necessary.
   1082 	 * Creators of virtual kstats may also reset ks_data.  It is
   1083 	 * also up to the caller to initialize the kstat data section,
   1084 	 * if necessary.  All initialization must be complete before
   1085 	 * calling kstat_install().
   1086 	 */
   1087 	e->e_zone.zoneid = ks_zoneid;
   1088 	e->e_zone.next = NULL;
   1089 
   1090 	ksp = &e->e_ks;
   1091 	ksp->ks_crtime		= gethrtime();
   1092 	kstat_set_string(ksp->ks_module, ks_module);
   1093 	ksp->ks_instance	= ks_instance;
   1094 	kstat_set_string(ksp->ks_name, ks_name);
   1095 	ksp->ks_type		= ks_type;
   1096 	kstat_set_string(ksp->ks_class, ks_class);
   1097 	ksp->ks_flags		= ks_flags | KSTAT_FLAG_INVALID;
   1098 	if (ks_flags & KSTAT_FLAG_VIRTUAL)
   1099 		ksp->ks_data	= NULL;
   1100 	else
   1101 		ksp->ks_data	= (void *)(e + 1);
   1102 	ksp->ks_ndata		= ks_ndata;
   1103 	ksp->ks_data_size	= ks_data_size;
   1104 	ksp->ks_snaptime	= ksp->ks_crtime;
   1105 	ksp->ks_update		= default_kstat_update;
   1106 	ksp->ks_private		= NULL;
   1107 	ksp->ks_snapshot	= default_kstat_snapshot;
   1108 	ksp->ks_lock		= NULL;
   1109 
   1110 	mutex_enter(&kstat_chain_lock);
   1111 
   1112 	/*
   1113 	 * Add our kstat to the AVL trees.
   1114 	 */
   1115 	if (avl_find(&kstat_avl_byname, e, &where) != NULL) {
   1116 		mutex_exit(&kstat_chain_lock);
   1117 		cmn_err(CE_WARN,
   1118 		    "kstat_create('%s', %d, '%s'): namespace collision",
   1119 		    ks_module, ks_instance, ks_name);
   1120 		kstat_free(e);
   1121 		return (NULL);
   1122 	}
   1123 	avl_insert(&kstat_avl_byname, e, where);
   1124 
   1125 	/*
   1126 	 * Loop around until we find an unused KID.
   1127 	 */
   1128 	do {
   1129 		ksp->ks_kid = kstat_chain_id++;
   1130 	} while (avl_find(&kstat_avl_bykid, e, &where) != NULL);
   1131 	avl_insert(&kstat_avl_bykid, e, where);
   1132 
   1133 	mutex_exit(&kstat_chain_lock);
   1134 
   1135 	return (ksp);
   1136 }
   1137 
   1138 /*
   1139  * Activate a fully initialized kstat and make it visible to /dev/kstat.
   1140  */
   1141 void
   1142 kstat_install(kstat_t *ksp)
   1143 {
   1144 	zoneid_t zoneid = ((ekstat_t *)ksp)->e_zone.zoneid;
   1145 
   1146 	/*
   1147 	 * If this is a variable-size kstat, it MUST provide kstat data locking
   1148 	 * to prevent data-size races with kstat readers.
   1149 	 */
   1150 	if ((ksp->ks_flags & KSTAT_FLAG_VAR_SIZE) && ksp->ks_lock == NULL) {
   1151 		panic("kstat_install('%s', %d, '%s'): "
   1152 		    "cannot create variable-size kstat without data lock",
   1153 		    ksp->ks_module, ksp->ks_instance, ksp->ks_name);
   1154 	}
   1155 
   1156 	if (kstat_hold_bykid(ksp->ks_kid, zoneid) != ksp) {
   1157 		cmn_err(CE_WARN, "kstat_install(%p): does not exist",
   1158 		    (void *)ksp);
   1159 		return;
   1160 	}
   1161 
   1162 	if (ksp->ks_type == KSTAT_TYPE_NAMED && ksp->ks_data != NULL) {
   1163 		int has_long_strings = 0;
   1164 		uint_t i;
   1165 		kstat_named_t *knp = KSTAT_NAMED_PTR(ksp);
   1166 
   1167 		for (i = 0; i < ksp->ks_ndata; i++, knp++) {
   1168 			if (knp->data_type == KSTAT_DATA_STRING) {
   1169 				has_long_strings = 1;
   1170 				break;
   1171 			}
   1172 		}
   1173 		/*
   1174 		 * It is an error for a named kstat with fields of
   1175 		 * KSTAT_DATA_STRING to be non-virtual.
   1176 		 */
   1177 		if (has_long_strings && !(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) {
   1178 			panic("kstat_install('%s', %d, '%s'): "
   1179 			    "named kstat containing KSTAT_DATA_STRING "
   1180 			    "is not virtual",
   1181 			    ksp->ks_module, ksp->ks_instance,
   1182 			    ksp->ks_name);
   1183 		}
   1184 		/*
   1185 		 * The default snapshot routine does not handle KSTAT_WRITE
   1186 		 * for long strings.
   1187 		 */
   1188 		if (has_long_strings && (ksp->ks_flags & KSTAT_FLAG_WRITABLE) &&
   1189 		    (ksp->ks_snapshot == default_kstat_snapshot)) {
   1190 			panic("kstat_install('%s', %d, '%s'): "
   1191 			    "named kstat containing KSTAT_DATA_STRING "
   1192 			    "is writable but uses default snapshot routine",
   1193 			    ksp->ks_module, ksp->ks_instance, ksp->ks_name);
   1194 		}
   1195 	}
   1196 
   1197 	if (ksp->ks_flags & KSTAT_FLAG_DORMANT) {
   1198 
   1199 		/*
   1200 		 * We are reactivating a dormant kstat.  Initialize the
   1201 		 * caller's underlying data to the value it had when the
   1202 		 * kstat went dormant, and mark the kstat as active.
   1203 		 * Grab the provider's kstat lock if it's not already held.
   1204 		 */
   1205 		kmutex_t *lp = ksp->ks_lock;
   1206 		if (lp != NULL && MUTEX_NOT_HELD(lp)) {
   1207 			mutex_enter(lp);
   1208 			(void) KSTAT_UPDATE(ksp, KSTAT_WRITE);
   1209 			mutex_exit(lp);
   1210 		} else {
   1211 			(void) KSTAT_UPDATE(ksp, KSTAT_WRITE);
   1212 		}
   1213 		ksp->ks_flags &= ~KSTAT_FLAG_DORMANT;
   1214 	}
   1215 
   1216 	/*
   1217 	 * Now that the kstat is active, make it visible to the kstat driver.
   1218 	 */
   1219 	ksp->ks_flags &= ~KSTAT_FLAG_INVALID;
   1220 	kstat_rele(ksp);
   1221 }
   1222 
   1223 /*
   1224  * Remove a kstat from the system.  Or, if it's a persistent kstat,
   1225  * just update the data and mark it as dormant.
   1226  */
   1227 void
   1228 kstat_delete(kstat_t *ksp)
   1229 {
   1230 	kmutex_t *lp;
   1231 	ekstat_t *e = (ekstat_t *)ksp;
   1232 	zoneid_t zoneid = e->e_zone.zoneid;
   1233 	kstat_zone_t *kz;
   1234 
   1235 	if (ksp == NULL)
   1236 		return;
   1237 
   1238 	lp = ksp->ks_lock;
   1239 
   1240 	if (lp != NULL && MUTEX_HELD(lp)) {
   1241 		panic("kstat_delete(%p): caller holds data lock %p",
   1242 		    (void *)ksp, (void *)lp);
   1243 	}
   1244 
   1245 	if (kstat_hold_bykid(ksp->ks_kid, zoneid) != ksp) {
   1246 		cmn_err(CE_WARN, "kstat_delete(%p): does not exist",
   1247 		    (void *)ksp);
   1248 		return;
   1249 	}
   1250 
   1251 	if (ksp->ks_flags & KSTAT_FLAG_PERSISTENT) {
   1252 		/*
   1253 		 * Update the data one last time, so that all activity
   1254 		 * prior to going dormant has been accounted for.
   1255 		 */
   1256 		KSTAT_ENTER(ksp);
   1257 		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
   1258 		KSTAT_EXIT(ksp);
   1259 
   1260 		/*
   1261 		 * Mark the kstat as dormant and restore caller-modifiable
   1262 		 * fields to default values, so the kstat is readable during
   1263 		 * the dormant phase.
   1264 		 */
   1265 		ksp->ks_flags |= KSTAT_FLAG_DORMANT;
   1266 		ksp->ks_lock = NULL;
   1267 		ksp->ks_update = default_kstat_update;
   1268 		ksp->ks_private = NULL;
   1269 		ksp->ks_snapshot = default_kstat_snapshot;
   1270 		kstat_rele(ksp);
   1271 		return;
   1272 	}
   1273 
   1274 	/*
   1275 	 * Remove the kstat from the framework's AVL trees,
   1276 	 * free the allocated memory, and increment kstat_chain_id so
   1277 	 * /dev/kstat clients can detect the event.
   1278 	 */
   1279 	mutex_enter(&kstat_chain_lock);
   1280 	avl_remove(&kstat_avl_bykid, e);
   1281 	avl_remove(&kstat_avl_byname, e);
   1282 	kstat_chain_id++;
   1283 	mutex_exit(&kstat_chain_lock);
   1284 
   1285 	kz = e->e_zone.next;
   1286 	while (kz != NULL) {
   1287 		kstat_zone_t *t = kz;
   1288 
   1289 		kz = kz->next;
   1290 		kmem_free(t, sizeof (*t));
   1291 	}
   1292 	kstat_rele(ksp);
   1293 	kstat_free(e);
   1294 }
   1295 
   1296 void
   1297 kstat_delete_byname_zone(const char *ks_module, int ks_instance,
   1298     const char *ks_name, zoneid_t ks_zoneid)
   1299 {
   1300 	kstat_t *ksp;
   1301 
   1302 	ksp = kstat_hold_byname(ks_module, ks_instance, ks_name, ks_zoneid);
   1303 	if (ksp != NULL) {
   1304 		kstat_rele(ksp);
   1305 		kstat_delete(ksp);
   1306 	}
   1307 }
   1308 
   1309 void
   1310 kstat_delete_byname(const char *ks_module, int ks_instance, const char *ks_name)
   1311 {
   1312 	kstat_delete_byname_zone(ks_module, ks_instance, ks_name, ALL_ZONES);
   1313 }
   1314 
   1315 /*
   1316  * The sparc V9 versions of these routines can be much cheaper than
   1317  * the poor 32-bit compiler can comprehend, so they're in sparcv9_subr.s.
   1318  * For simplicity, however, we always feed the C versions to lint.
   1319  */
   1320 #if !defined(__sparc) || defined(lint) || defined(__lint)
   1321 
   1322 void
   1323 kstat_waitq_enter(kstat_io_t *kiop)
   1324 {
   1325 	hrtime_t new, delta;
   1326 	ulong_t wcnt;
   1327 
   1328 	new = gethrtime_unscaled();
   1329 	delta = new - kiop->wlastupdate;
   1330 	kiop->wlastupdate = new;
   1331 	wcnt = kiop->wcnt++;
   1332 	if (wcnt != 0) {
   1333 		kiop->wlentime += delta * wcnt;
   1334 		kiop->wtime += delta;
   1335 	}
   1336 }
   1337 
   1338 void
   1339 kstat_waitq_exit(kstat_io_t *kiop)
   1340 {
   1341 	hrtime_t new, delta;
   1342 	ulong_t wcnt;
   1343 
   1344 	new = gethrtime_unscaled();
   1345 	delta = new - kiop->wlastupdate;
   1346 	kiop->wlastupdate = new;
   1347 	wcnt = kiop->wcnt--;
   1348 	ASSERT((int)wcnt > 0);
   1349 	kiop->wlentime += delta * wcnt;
   1350 	kiop->wtime += delta;
   1351 }
   1352 
   1353 void
   1354 kstat_runq_enter(kstat_io_t *kiop)
   1355 {
   1356 	hrtime_t new, delta;
   1357 	ulong_t rcnt;
   1358 
   1359 	new = gethrtime_unscaled();
   1360 	delta = new - kiop->rlastupdate;
   1361 	kiop->rlastupdate = new;
   1362 	rcnt = kiop->rcnt++;
   1363 	if (rcnt != 0) {
   1364 		kiop->rlentime += delta * rcnt;
   1365 		kiop->rtime += delta;
   1366 	}
   1367 }
   1368 
   1369 void
   1370 kstat_runq_exit(kstat_io_t *kiop)
   1371 {
   1372 	hrtime_t new, delta;
   1373 	ulong_t rcnt;
   1374 
   1375 	new = gethrtime_unscaled();
   1376 	delta = new - kiop->rlastupdate;
   1377 	kiop->rlastupdate = new;
   1378 	rcnt = kiop->rcnt--;
   1379 	ASSERT((int)rcnt > 0);
   1380 	kiop->rlentime += delta * rcnt;
   1381 	kiop->rtime += delta;
   1382 }
   1383 
   1384 void
   1385 kstat_waitq_to_runq(kstat_io_t *kiop)
   1386 {
   1387 	hrtime_t new, delta;
   1388 	ulong_t wcnt, rcnt;
   1389 
   1390 	new = gethrtime_unscaled();
   1391 
   1392 	delta = new - kiop->wlastupdate;
   1393 	kiop->wlastupdate = new;
   1394 	wcnt = kiop->wcnt--;
   1395 	ASSERT((int)wcnt > 0);
   1396 	kiop->wlentime += delta * wcnt;
   1397 	kiop->wtime += delta;
   1398 
   1399 	delta = new - kiop->rlastupdate;
   1400 	kiop->rlastupdate = new;
   1401 	rcnt = kiop->rcnt++;
   1402 	if (rcnt != 0) {
   1403 		kiop->rlentime += delta * rcnt;
   1404 		kiop->rtime += delta;
   1405 	}
   1406 }
   1407 
   1408 void
   1409 kstat_runq_back_to_waitq(kstat_io_t *kiop)
   1410 {
   1411 	hrtime_t new, delta;
   1412 	ulong_t wcnt, rcnt;
   1413 
   1414 	new = gethrtime_unscaled();
   1415 
   1416 	delta = new - kiop->rlastupdate;
   1417 	kiop->rlastupdate = new;
   1418 	rcnt = kiop->rcnt--;
   1419 	ASSERT((int)rcnt > 0);
   1420 	kiop->rlentime += delta * rcnt;
   1421 	kiop->rtime += delta;
   1422 
   1423 	delta = new - kiop->wlastupdate;
   1424 	kiop->wlastupdate = new;
   1425 	wcnt = kiop->wcnt++;
   1426 	if (wcnt != 0) {
   1427 		kiop->wlentime += delta * wcnt;
   1428 		kiop->wtime += delta;
   1429 	}
   1430 }
   1431 
   1432 #endif
   1433 
   1434 void
   1435 kstat_timer_start(kstat_timer_t *ktp)
   1436 {
   1437 	ktp->start_time = gethrtime();
   1438 }
   1439 
   1440 void
   1441 kstat_timer_stop(kstat_timer_t *ktp)
   1442 {
   1443 	hrtime_t	etime;
   1444 	u_longlong_t	num_events;
   1445 
   1446 	ktp->stop_time = etime = gethrtime();
   1447 	etime -= ktp->start_time;
   1448 	num_events = ktp->num_events;
   1449 	if (etime < ktp->min_time || num_events == 0)
   1450 		ktp->min_time = etime;
   1451 	if (etime > ktp->max_time)
   1452 		ktp->max_time = etime;
   1453 	ktp->elapsed_time += etime;
   1454 	ktp->num_events = num_events + 1;
   1455 }
   1456