Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 
     27 /*
     28  * CPU Performance Counter system calls and device driver.
     29  *
     30  * This module uses a combination of thread context operators, and
     31  * thread-specific data to export CPU performance counters
     32  * via both a system call and a driver interface.
     33  *
     34  * There are three access methods exported - the 'shared' device
     35  * and the 'private' and 'agent' variants of the system call.
     36  *
     37  * The shared device treats the performance counter registers as
     38  * a processor metric, regardless of the work scheduled on them.
     39  * The private system call treats the performance counter registers
     40  * as a property of a single lwp.  This is achieved by using the
     41  * thread context operators to virtualize the contents of the
     42  * performance counter registers between lwps.
     43  *
     44  * The agent method is like the private method, except that it must
     45  * be accessed via /proc's agent lwp to allow the counter context of
     46  * other threads to be examined safely.
     47  *
     48  * The shared usage fundamentally conflicts with the agent and private usage;
     49  * almost all of the complexity of the module is needed to allow these two
     50  * models to co-exist in a reasonable way.
     51  */
     52 
     53 #include <sys/types.h>
     54 #include <sys/file.h>
     55 #include <sys/errno.h>
     56 #include <sys/open.h>
     57 #include <sys/cred.h>
     58 #include <sys/conf.h>
     59 #include <sys/stat.h>
     60 #include <sys/processor.h>
     61 #include <sys/cpuvar.h>
     62 #include <sys/disp.h>
     63 #include <sys/kmem.h>
     64 #include <sys/modctl.h>
     65 #include <sys/ddi.h>
     66 #include <sys/sunddi.h>
     67 #include <sys/nvpair.h>
     68 #include <sys/policy.h>
     69 #include <sys/machsystm.h>
     70 #include <sys/cpc_impl.h>
     71 #include <sys/cpc_pcbe.h>
     72 #include <sys/kcpc.h>
     73 
/*
 * Forward declarations for helpers that are defined later in this file but
 * used by the system call and ioctl entry points above their definitions.
 */
static int kcpc_copyin_set(kcpc_set_t **set, void *ubuf, size_t len);
static int kcpc_verify_set(kcpc_set_t *set);
static uint32_t kcpc_nvlist_npairs(nvlist_t *list);

/*
 * Generic attributes supported regardless of processor.
 *
 * ATTRLIST is appended to the PCBE-supplied attribute list by the
 * CPC_LIST_ATTRS command; SEPARATOR is inserted between the two lists when
 * the PCBE-supplied list is not empty.
 */

#define	ATTRLIST "picnum"
#define	SEPARATOR ","
     84 
     85 /*
     86  * System call to access CPU performance counters.
     87  */
     88 static int
     89 cpc(int cmd, id_t lwpid, void *udata1, void *udata2, void *udata3)
     90 {
     91 	kthread_t	*t;
     92 	int		error;
     93 	int		size;
     94 	const char	*str;
     95 	int		code;
     96 
     97 	/*
     98 	 * This CPC syscall should only be loaded if it found a PCBE to use.
     99 	 */
    100 	ASSERT(pcbe_ops != NULL);
    101 
    102 	if (curproc->p_agenttp == curthread) {
    103 		/*
    104 		 * Only if /proc is invoking this system call from
    105 		 * the agent thread do we allow the caller to examine
    106 		 * the contexts of other lwps in the process.  And
    107 		 * because we know we're the agent, we know we don't
    108 		 * have to grab p_lock because no-one else can change
    109 		 * the state of the process.
    110 		 */
    111 		if ((t = idtot(curproc, lwpid)) == NULL || t == curthread)
    112 			return (set_errno(ESRCH));
    113 		ASSERT(t->t_tid == lwpid && ttolwp(t) != NULL);
    114 	} else
    115 		t = curthread;
    116 
    117 	if (t->t_cpc_set == NULL && (cmd == CPC_SAMPLE || cmd == CPC_RELE))
    118 		return (set_errno(EINVAL));
    119 
    120 	switch (cmd) {
    121 	case CPC_BIND:
    122 		/*
    123 		 * udata1 = pointer to packed nvlist buffer
    124 		 * udata2 = size of packed nvlist buffer
    125 		 * udata3 = User addr to return error subcode in.
    126 		 */
    127 
    128 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
    129 		if (kcpc_cpuctx) {
    130 			rw_exit(&kcpc_cpuctx_lock);
    131 			return (set_errno(EAGAIN));
    132 		}
    133 
    134 		if (kcpc_hw_lwp_hook() != 0) {
    135 			rw_exit(&kcpc_cpuctx_lock);
    136 			return (set_errno(EACCES));
    137 		}
    138 
    139 		/*
    140 		 * An LWP may only have one set bound to it at a time; if there
    141 		 * is a set bound to this LWP already, we unbind it here.
    142 		 */
    143 		if (t->t_cpc_set != NULL)
    144 			(void) kcpc_unbind(t->t_cpc_set);
    145 		ASSERT(t->t_cpc_set == NULL);
    146 
    147 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
    148 		    (size_t)udata2)) != 0) {
    149 			rw_exit(&kcpc_cpuctx_lock);
    150 			return (set_errno(error));
    151 		}
    152 
    153 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
    154 			rw_exit(&kcpc_cpuctx_lock);
    155 			kcpc_free_set(t->t_cpc_set);
    156 			t->t_cpc_set = NULL;
    157 			if (copyout(&error, udata3, sizeof (error)) == -1)
    158 				return (set_errno(EFAULT));
    159 			return (set_errno(EINVAL));
    160 		}
    161 
    162 		if ((error = kcpc_bind_thread(t->t_cpc_set, t, &code)) != 0) {
    163 			rw_exit(&kcpc_cpuctx_lock);
    164 			kcpc_free_set(t->t_cpc_set);
    165 			t->t_cpc_set = NULL;
    166 			/*
    167 			 * EINVAL and EACCES are the only errors with more
    168 			 * specific subcodes.
    169 			 */
    170 			if ((error == EINVAL || error == EACCES) &&
    171 			    copyout(&code, udata3, sizeof (code)) == -1)
    172 				return (set_errno(EFAULT));
    173 			return (set_errno(error));
    174 		}
    175 
    176 		rw_exit(&kcpc_cpuctx_lock);
    177 		return (0);
    178 	case CPC_SAMPLE:
    179 		/*
    180 		 * udata1 = pointer to user's buffer
    181 		 * udata2 = pointer to user's hrtime
    182 		 * udata3 = pointer to user's tick
    183 		 */
    184 		/*
    185 		 * We only allow thread-bound sets to be sampled via the
    186 		 * syscall, so if this set has a CPU-bound context, return an
    187 		 * error.
    188 		 */
    189 		if (t->t_cpc_set->ks_ctx->kc_cpuid != -1)
    190 			return (set_errno(EINVAL));
    191 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
    192 		    udata3)) != 0)
    193 			return (set_errno(error));
    194 
    195 		return (0);
    196 	case CPC_PRESET:
    197 	case CPC_RESTART:
    198 		/*
    199 		 * These are valid only if this lwp has a bound set.
    200 		 */
    201 		if (t->t_cpc_set == NULL)
    202 			return (set_errno(EINVAL));
    203 		if (cmd == CPC_PRESET) {
    204 			/*
    205 			 * The preset is shipped up to us from userland in two
    206 			 * parts. This lets us handle 64-bit values from 32-bit
    207 			 * and 64-bit applications in the same manner.
    208 			 *
    209 			 * udata1 = index of request to preset
    210 			 * udata2 = new 64-bit preset (most sig. 32 bits)
    211 			 * udata3 = new 64-bit preset (least sig. 32 bits)
    212 			 */
    213 			if ((error = kcpc_preset(t->t_cpc_set, (intptr_t)udata1,
    214 			    ((uint64_t)(uintptr_t)udata2 << 32ULL) |
    215 			    (uint64_t)(uintptr_t)udata3)) != 0)
    216 				return (set_errno(error));
    217 		} else {
    218 			/*
    219 			 * udata[1-3] = unused
    220 			 */
    221 			if ((error = kcpc_restart(t->t_cpc_set)) != 0)
    222 				return (set_errno(error));
    223 		}
    224 		return (0);
    225 	case CPC_ENABLE:
    226 	case CPC_DISABLE:
    227 		udata1 = 0;
    228 		/*FALLTHROUGH*/
    229 	case CPC_USR_EVENTS:
    230 	case CPC_SYS_EVENTS:
    231 		if (t != curthread || t->t_cpc_set == NULL)
    232 			return (set_errno(EINVAL));
    233 		/*
    234 		 * Provided for backwards compatibility with CPCv1.
    235 		 *
    236 		 * Stop the counters and record the current counts. Use the
    237 		 * counts as the preset to rebind a new set with the requests
    238 		 * reconfigured as requested.
    239 		 *
    240 		 * udata1: 1 == enable; 0 == disable
    241 		 * udata{2,3}: unused
    242 		 */
    243 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
    244 		if ((error = kcpc_enable(t,
    245 		    cmd, (int)(uintptr_t)udata1)) != 0) {
    246 			rw_exit(&kcpc_cpuctx_lock);
    247 			return (set_errno(error));
    248 		}
    249 		rw_exit(&kcpc_cpuctx_lock);
    250 		return (0);
    251 	case CPC_NPIC:
    252 		return (cpc_ncounters);
    253 	case CPC_CAPS:
    254 		return (pcbe_ops->pcbe_caps);
    255 	case CPC_EVLIST_SIZE:
    256 	case CPC_LIST_EVENTS:
    257 		/*
    258 		 * udata1 = pointer to user's int or buffer
    259 		 * udata2 = picnum
    260 		 * udata3 = unused
    261 		 */
    262 		if ((uintptr_t)udata2 >= cpc_ncounters)
    263 			return (set_errno(EINVAL));
    264 
    265 		size = strlen(
    266 		    pcbe_ops->pcbe_list_events((uintptr_t)udata2)) + 1;
    267 
    268 		if (cmd == CPC_EVLIST_SIZE) {
    269 			if (suword32(udata1, size) == -1)
    270 				return (set_errno(EFAULT));
    271 		} else {
    272 			if (copyout(
    273 			    pcbe_ops->pcbe_list_events((uintptr_t)udata2),
    274 			    udata1, size) == -1)
    275 				return (set_errno(EFAULT));
    276 		}
    277 		return (0);
    278 	case CPC_ATTRLIST_SIZE:
    279 	case CPC_LIST_ATTRS:
    280 		/*
    281 		 * udata1 = pointer to user's int or buffer
    282 		 * udata2 = unused
    283 		 * udata3 = unused
    284 		 *
    285 		 * attrlist size is length of PCBE-supported attributes, plus
    286 		 * room for "picnum\0" plus an optional ',' separator char.
    287 		 */
    288 		str = pcbe_ops->pcbe_list_attrs();
    289 		size = strlen(str) + sizeof (SEPARATOR ATTRLIST) + 1;
    290 		if (str[0] != '\0')
    291 			/*
    292 			 * A ',' separator character is necessary.
    293 			 */
    294 			size += 1;
    295 
    296 		if (cmd == CPC_ATTRLIST_SIZE) {
    297 			if (suword32(udata1, size) == -1)
    298 				return (set_errno(EFAULT));
    299 		} else {
    300 			/*
    301 			 * Copyout the PCBE attributes, and then append the
    302 			 * generic attribute list (with separator if necessary).
    303 			 */
    304 			if (copyout(str, udata1, strlen(str)) == -1)
    305 				return (set_errno(EFAULT));
    306 			if (str[0] != '\0') {
    307 				if (copyout(SEPARATOR ATTRLIST,
    308 				    ((char *)udata1) + strlen(str),
    309 				    strlen(SEPARATOR ATTRLIST) + 1)
    310 				    == -1)
    311 					return (set_errno(EFAULT));
    312 			} else
    313 				if (copyout(ATTRLIST,
    314 				    (char *)udata1 + strlen(str),
    315 				    strlen(ATTRLIST) + 1) == -1)
    316 					return (set_errno(EFAULT));
    317 		}
    318 		return (0);
    319 	case CPC_IMPL_NAME:
    320 	case CPC_CPUREF:
    321 		/*
    322 		 * udata1 = pointer to user's buffer
    323 		 * udata2 = unused
    324 		 * udata3 = unused
    325 		 */
    326 		if (cmd == CPC_IMPL_NAME) {
    327 			str = pcbe_ops->pcbe_impl_name();
    328 			ASSERT(strlen(str) < CPC_MAX_IMPL_NAME);
    329 		} else {
    330 			str = pcbe_ops->pcbe_cpuref();
    331 			ASSERT(strlen(str) < CPC_MAX_CPUREF);
    332 		}
    333 
    334 		if (copyout(str, udata1, strlen(str) + 1) != 0)
    335 			return (set_errno(EFAULT));
    336 		return (0);
    337 	case CPC_INVALIDATE:
    338 		kcpc_invalidate(t);
    339 		return (0);
    340 	case CPC_RELE:
    341 		if ((error = kcpc_unbind(t->t_cpc_set)) != 0)
    342 			return (set_errno(error));
    343 		return (0);
    344 	default:
    345 		return (set_errno(EINVAL));
    346 	}
    347 }
    348 
    349 /*
    350  * The 'shared' device allows direct access to the
    351  * performance counter control register of the current CPU.
    352  * The major difference between the contexts created here and those
    353  * above is that the context handlers are -not- installed, thus
    354  * no context switching behaviour occurs.
    355  *
    356  * Because they manipulate per-cpu state, these ioctls can
    357  * only be invoked from a bound lwp, by a caller with the cpc_cpu privilege
    358  * who can open the relevant entry in /devices (the act of holding it open
    359  * causes other uses of the counters to be suspended).
    360  *
    361  * Note that for correct results, the caller -must- ensure that
    362  * all existing per-lwp contexts are either inactive or marked invalid;
    363  * that's what the open routine does.
    364  */
    365 /*ARGSUSED*/
    366 static int
    367 kcpc_ioctl(dev_t dev, int cmd, intptr_t data, int flags, cred_t *cr, int *rvp)
    368 {
    369 	kthread_t	*t = curthread;
    370 	processorid_t	cpuid;
    371 	void		*udata1 = NULL;
    372 	void		*udata2 = NULL;
    373 	void		*udata3 = NULL;
    374 	int		error;
    375 	int		code;
    376 
    377 	STRUCT_DECL(__cpc_args, args);
    378 
    379 	STRUCT_INIT(args, flags);
    380 
    381 	if (curthread->t_bind_cpu != getminor(dev))
    382 		return (EAGAIN);  /* someone unbound it? */
    383 
    384 	cpuid = getminor(dev);
    385 
    386 	if (cmd == CPCIO_BIND || cmd == CPCIO_SAMPLE) {
    387 		if (copyin((void *)data, STRUCT_BUF(args),
    388 		    STRUCT_SIZE(args)) == -1)
    389 			return (EFAULT);
    390 
    391 		udata1 = STRUCT_FGETP(args, udata1);
    392 		udata2 = STRUCT_FGETP(args, udata2);
    393 		udata3 = STRUCT_FGETP(args, udata3);
    394 	}
    395 
    396 	switch (cmd) {
    397 	case CPCIO_BIND:
    398 		/*
    399 		 * udata1 = pointer to packed nvlist buffer
    400 		 * udata2 = size of packed nvlist buffer
    401 		 * udata3 = User addr to return error subcode in.
    402 		 */
    403 		if (t->t_cpc_set != NULL) {
    404 			(void) kcpc_unbind(t->t_cpc_set);
    405 			ASSERT(t->t_cpc_set == NULL);
    406 		}
    407 
    408 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
    409 		    (size_t)udata2)) != 0) {
    410 			return (error);
    411 		}
    412 
    413 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
    414 			kcpc_free_set(t->t_cpc_set);
    415 			t->t_cpc_set = NULL;
    416 			if (copyout(&error, udata3, sizeof (error)) == -1)
    417 				return (EFAULT);
    418 			return (EINVAL);
    419 		}
    420 
    421 		if ((error = kcpc_bind_cpu(t->t_cpc_set, cpuid, &code)) != 0) {
    422 			kcpc_free_set(t->t_cpc_set);
    423 			t->t_cpc_set = NULL;
    424 			/*
    425 			 * Subcodes are only returned for EINVAL and EACCESS.
    426 			 */
    427 			if ((error == EINVAL || error == EACCES) &&
    428 			    copyout(&code, udata3, sizeof (code)) == -1)
    429 				return (EFAULT);
    430 			return (error);
    431 		}
    432 
    433 		return (0);
    434 	case CPCIO_SAMPLE:
    435 		/*
    436 		 * udata1 = pointer to user's buffer
    437 		 * udata2 = pointer to user's hrtime
    438 		 * udata3 = pointer to user's tick
    439 		 */
    440 		/*
    441 		 * Only CPU-bound sets may be sampled via the ioctl(). If this
    442 		 * set has no CPU-bound context, return an error.
    443 		 */
    444 		if (t->t_cpc_set == NULL)
    445 			return (EINVAL);
    446 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
    447 		    udata3)) != 0)
    448 			return (error);
    449 		return (0);
    450 	case CPCIO_RELE:
    451 		if (t->t_cpc_set == NULL)
    452 			return (EINVAL);
    453 		return (kcpc_unbind(t->t_cpc_set));
    454 	default:
    455 		return (EINVAL);
    456 	}
    457 }
    458 
/*
 * The device supports multiple opens, but only one open
 * is allowed per processor.  This is to enable multiple
 * instances of tools looking at different processors.
 *
 * KCPC_MINOR_SHARED is the minor number of the single "shared" node created
 * at attach time.  kcpc_open() rewrites the minor number of each open
 * instance to the id of the CPU the opener is bound to.
 */
#define	KCPC_MINOR_SHARED		((minor_t)0x3fffful)

/*
 * One bit per CPU id that currently has the device open.  Allocated on the
 * first open and freed on the last close; both are done while holding
 * kcpc_cpuctx_lock as writer.
 */
static ulong_t *kcpc_cpumap;		/* bitmap of cpus */
    467 
    468 /*ARGSUSED1*/
    469 static int
    470 kcpc_open(dev_t *dev, int flags, int otyp, cred_t *cr)
    471 {
    472 	processorid_t	cpuid;
    473 	int		error;
    474 
    475 	ASSERT(pcbe_ops != NULL);
    476 
    477 	if ((error = secpolicy_cpc_cpu(cr)) != 0)
    478 		return (error);
    479 	if (getminor(*dev) != KCPC_MINOR_SHARED)
    480 		return (ENXIO);
    481 	if ((cpuid = curthread->t_bind_cpu) == PBIND_NONE)
    482 		return (EINVAL);
    483 	if (cpuid > max_cpuid)
    484 		return (EINVAL);
    485 
    486 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
    487 	if (++kcpc_cpuctx == 1) {
    488 		ASSERT(kcpc_cpumap == NULL);
    489 		kcpc_cpumap = kmem_zalloc(BT_SIZEOFMAP(max_cpuid + 1),
    490 		    KM_SLEEP);
    491 		/*
    492 		 * When this device is open for processor-based contexts,
    493 		 * no further lwp-based contexts can be created.
    494 		 *
    495 		 * Since this is the first open, ensure that all existing
    496 		 * contexts are invalidated.
    497 		 */
    498 		kcpc_invalidate_all();
    499 	} else if (BT_TEST(kcpc_cpumap, cpuid)) {
    500 		kcpc_cpuctx--;
    501 		rw_exit(&kcpc_cpuctx_lock);
    502 		return (EAGAIN);
    503 	} else if (kcpc_hw_cpu_hook(cpuid, kcpc_cpumap) != 0) {
    504 		kcpc_cpuctx--;
    505 		rw_exit(&kcpc_cpuctx_lock);
    506 		return (EACCES);
    507 	}
    508 	BT_SET(kcpc_cpumap, cpuid);
    509 	rw_exit(&kcpc_cpuctx_lock);
    510 
    511 	*dev = makedevice(getmajor(*dev), (minor_t)cpuid);
    512 
    513 	return (0);
    514 }
    515 
    516 /*ARGSUSED1*/
    517 static int
    518 kcpc_close(dev_t dev, int flags, int otyp, cred_t *cr)
    519 {
    520 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
    521 	BT_CLEAR(kcpc_cpumap, getminor(dev));
    522 	if (--kcpc_cpuctx == 0) {
    523 		kmem_free(kcpc_cpumap, BT_SIZEOFMAP(max_cpuid + 1));
    524 		kcpc_cpumap = NULL;
    525 	}
    526 	ASSERT(kcpc_cpuctx >= 0);
    527 	rw_exit(&kcpc_cpuctx_lock);
    528 
    529 	return (0);
    530 }
    531 
/*
 * Sane boundaries on the size of packed lists. In bytes.
 * kcpc_copyin_set() rejects user buffers whose length falls outside
 * this range before allocating anything.
 */
#define	CPC_MIN_PACKSIZE 4
#define	CPC_MAX_PACKSIZE 10000

/*
 * Sane boundary on the number of requests a set can contain.
 * Enforced by kcpc_copyin_set() on the unpacked "reqs" array.
 */
#define	CPC_MAX_NREQS 100

/*
 * Sane boundary on the number of attributes a request can contain.
 * Enforced by kcpc_copyin_set() after any "picnum" attribute has been
 * removed from the request's attribute list.
 */
#define	CPC_MAX_ATTRS 50
    547 
    548 /*
    549  * Copy in a packed nvlist from the user and create a request set out of it.
    550  * If successful, return 0 and store a pointer to the set we've created. Returns
    551  * error code on error.
    552  */
    553 int
    554 kcpc_copyin_set(kcpc_set_t **inset, void *ubuf, size_t len)
    555 {
    556 	kcpc_set_t	*set;
    557 	int		i;
    558 	int		j;
    559 	char		*packbuf;
    560 
    561 	nvlist_t	*nvl;
    562 	nvpair_t	*nvp = NULL;
    563 
    564 	nvlist_t	*attrs;
    565 	nvpair_t	*nvp_attr;
    566 	kcpc_attr_t	*attrp;
    567 
    568 	nvlist_t	**reqlist;
    569 	uint_t		nreqs;
    570 	uint64_t	uint64;
    571 	uint32_t	uint32;
    572 	uint32_t	setflags = (uint32_t)-1;
    573 	char		*string;
    574 	char		*name;
    575 
    576 	if (len < CPC_MIN_PACKSIZE || len > CPC_MAX_PACKSIZE)
    577 		return (EINVAL);
    578 
    579 	packbuf = kmem_alloc(len, KM_SLEEP);
    580 
    581 	if (copyin(ubuf, packbuf, len) == -1) {
    582 		kmem_free(packbuf, len);
    583 		return (EFAULT);
    584 	}
    585 
    586 	if (nvlist_unpack(packbuf, len, &nvl, KM_SLEEP) != 0) {
    587 		kmem_free(packbuf, len);
    588 		return (EINVAL);
    589 	}
    590 
    591 	/*
    592 	 * The nvlist has been unpacked so there is no need for the packed
    593 	 * representation from this point on.
    594 	 */
    595 	kmem_free(packbuf, len);
    596 
    597 	i = 0;
    598 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
    599 		switch (nvpair_type(nvp)) {
    600 		case DATA_TYPE_UINT32:
    601 			if (strcmp(nvpair_name(nvp), "flags") != 0 ||
    602 			    nvpair_value_uint32(nvp, &setflags) != 0) {
    603 				nvlist_free(nvl);
    604 				return (EINVAL);
    605 			}
    606 			break;
    607 		case DATA_TYPE_NVLIST_ARRAY:
    608 			if (strcmp(nvpair_name(nvp), "reqs") != 0 ||
    609 			    nvpair_value_nvlist_array(nvp, &reqlist,
    610 			    &nreqs) != 0) {
    611 				nvlist_free(nvl);
    612 				return (EINVAL);
    613 			}
    614 			break;
    615 		default:
    616 			nvlist_free(nvl);
    617 			return (EINVAL);
    618 		}
    619 		i++;
    620 	}
    621 
    622 	/*
    623 	 * There should be two members in the top-level nvlist:
    624 	 * an array of nvlists consisting of the requests, and flags.
    625 	 * Anything else is an invalid set.
    626 	 */
    627 	if (i != 2) {
    628 		nvlist_free(nvl);
    629 		return (EINVAL);
    630 	}
    631 
    632 	if (nreqs > CPC_MAX_NREQS) {
    633 		nvlist_free(nvl);
    634 		return (EINVAL);
    635 	}
    636 
    637 	/*
    638 	 * The requests are now stored in the nvlist array at reqlist.
    639 	 * Note that the use of kmem_zalloc() to alloc the kcpc_set_t means
    640 	 * we don't need to call the init routines for ks_lock and ks_condv.
    641 	 */
    642 	set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
    643 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
    644 	    nreqs, KM_SLEEP);
    645 	set->ks_nreqs = nreqs;
    646 	/*
    647 	 * If the nvlist didn't contain a flags member, setflags was initialized
    648 	 * with an illegal value and this set will fail sanity checks later on.
    649 	 */
    650 	set->ks_flags = setflags;
    651 	/*
    652 	 * Initialize bind/unbind set synchronization.
    653 	 */
    654 	set->ks_state &= ~KCPC_SET_BOUND;
    655 
    656 	/*
    657 	 * Build the set up one request at a time, always keeping it self-
    658 	 * consistent so we can give it to kcpc_free_set() if we need to back
    659 	 * out and return and error.
    660 	 */
    661 	for (i = 0; i < nreqs; i++) {
    662 		nvp = NULL;
    663 		set->ks_req[i].kr_picnum = -1;
    664 		while ((nvp = nvlist_next_nvpair(reqlist[i], nvp)) != NULL) {
    665 			name = nvpair_name(nvp);
    666 			switch (nvpair_type(nvp)) {
    667 			case DATA_TYPE_UINT32:
    668 				if (nvpair_value_uint32(nvp, &uint32) == EINVAL)
    669 					goto inval;
    670 				if (strcmp(name, "cr_flags") == 0)
    671 					set->ks_req[i].kr_flags = uint32;
    672 				if (strcmp(name, "cr_index") == 0)
    673 					set->ks_req[i].kr_index = uint32;
    674 				break;
    675 			case DATA_TYPE_UINT64:
    676 				if (nvpair_value_uint64(nvp, &uint64) == EINVAL)
    677 					goto inval;
    678 				if (strcmp(name, "cr_preset") == 0)
    679 					set->ks_req[i].kr_preset = uint64;
    680 				break;
    681 			case DATA_TYPE_STRING:
    682 				if (nvpair_value_string(nvp, &string) == EINVAL)
    683 					goto inval;
    684 				if (strcmp(name, "cr_event") == 0)
    685 					(void) strncpy(set->ks_req[i].kr_event,
    686 					    string, CPC_MAX_EVENT_LEN);
    687 				break;
    688 			case DATA_TYPE_NVLIST:
    689 				if (strcmp(name, "cr_attr") != 0)
    690 					goto inval;
    691 				if (nvpair_value_nvlist(nvp, &attrs) == EINVAL)
    692 					goto inval;
    693 				nvp_attr = NULL;
    694 				/*
    695 				 * If the picnum has been specified as an
    696 				 * attribute, consume that attribute here and
    697 				 * remove it from the list of attributes.
    698 				 */
    699 				if (nvlist_lookup_uint64(attrs, "picnum",
    700 				    &uint64) == 0) {
    701 					if (nvlist_remove(attrs, "picnum",
    702 					    DATA_TYPE_UINT64) != 0)
    703 						panic("nvlist %p faulty",
    704 						    (void *)attrs);
    705 					set->ks_req[i].kr_picnum = uint64;
    706 				}
    707 
    708 				if ((set->ks_req[i].kr_nattrs =
    709 				    kcpc_nvlist_npairs(attrs)) == 0)
    710 					break;
    711 
    712 				if (set->ks_req[i].kr_nattrs > CPC_MAX_ATTRS)
    713 					goto inval;
    714 
    715 				set->ks_req[i].kr_attr =
    716 				    kmem_alloc(set->ks_req[i].kr_nattrs *
    717 				    sizeof (kcpc_attr_t), KM_SLEEP);
    718 				j = 0;
    719 
    720 				while ((nvp_attr = nvlist_next_nvpair(attrs,
    721 				    nvp_attr)) != NULL) {
    722 					attrp = &set->ks_req[i].kr_attr[j];
    723 
    724 					if (nvpair_type(nvp_attr) !=
    725 					    DATA_TYPE_UINT64)
    726 						goto inval;
    727 
    728 					(void) strncpy(attrp->ka_name,
    729 					    nvpair_name(nvp_attr),
    730 					    CPC_MAX_ATTR_LEN);
    731 
    732 					if (nvpair_value_uint64(nvp_attr,
    733 					    &(attrp->ka_val)) == EINVAL)
    734 						goto inval;
    735 					j++;
    736 				}
    737 				ASSERT(j == set->ks_req[i].kr_nattrs);
    738 			default:
    739 				break;
    740 			}
    741 		}
    742 	}
    743 
    744 	nvlist_free(nvl);
    745 	*inset = set;
    746 	return (0);
    747 
    748 inval:
    749 	nvlist_free(nvl);
    750 	kcpc_free_set(set);
    751 	return (EINVAL);
    752 }
    753 
    754 /*
    755  * Count the number of nvpairs in the supplied nvlist.
    756  */
    757 static uint32_t
    758 kcpc_nvlist_npairs(nvlist_t *list)
    759 {
    760 	nvpair_t *nvp = NULL;
    761 	uint32_t n = 0;
    762 
    763 	while ((nvp = nvlist_next_nvpair(list, nvp)) != NULL)
    764 		n++;
    765 
    766 	return (n);
    767 }
    768 
    769 /*
    770  * Performs sanity checks on the given set.
    771  * Returns 0 if the set checks out OK.
    772  * Returns a detailed error subcode, or -1 if there is no applicable subcode.
    773  */
    774 static int
    775 kcpc_verify_set(kcpc_set_t *set)
    776 {
    777 	kcpc_request_t	*rp;
    778 	int		i;
    779 	uint64_t	bitmap = 0;
    780 	int		n;
    781 
    782 	if (set->ks_nreqs > cpc_ncounters)
    783 		return (-1);
    784 
    785 	if (CPC_SET_VALID_FLAGS(set->ks_flags) == 0)
    786 		return (-1);
    787 
    788 	for (i = 0; i < set->ks_nreqs; i++) {
    789 		rp = &set->ks_req[i];
    790 
    791 		/*
    792 		 * The following comparison must cast cpc_ncounters to an int,
    793 		 * because kr_picnum will be -1 if the request didn't explicitly
    794 		 * choose a PIC.
    795 		 */
    796 		if (rp->kr_picnum >= (int)cpc_ncounters)
    797 			return (CPC_INVALID_PICNUM);
    798 
    799 		/*
    800 		 * Of the pics whose physical picnum has been specified, make
    801 		 * sure each PIC appears only once in set.
    802 		 */
    803 		if ((n = set->ks_req[i].kr_picnum) != -1) {
    804 			if ((bitmap & (1 << n)) != 0)
    805 				return (-1);
    806 			bitmap |= (1 << n);
    807 		}
    808 
    809 		/*
    810 		 * Make sure the requested index falls within the range of all
    811 		 * requests.
    812 		 */
    813 		if (rp->kr_index < 0 || rp->kr_index >= set->ks_nreqs)
    814 			return (-1);
    815 
    816 		/*
    817 		 * Make sure there are no unknown flags.
    818 		 */
    819 		if (KCPC_REQ_VALID_FLAGS(rp->kr_flags) == 0)
    820 			return (CPC_REQ_INVALID_FLAGS);
    821 	}
    822 
    823 	return (0);
    824 }
    825 
/*
 * Character device entry points for the shared device.  Only open, close
 * and ioctl are implemented by this driver; the remaining entry points are
 * filled with the nodev/nochpoll stubs.
 */
static struct cb_ops cb_ops = {
	kcpc_open,	/* open */
	kcpc_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	kcpc_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* streamtab; NOTE(review): presumably unused here */
	D_NEW | D_MP	/* driver flags */
};
    843 
/*
 * Probe entry point.  The device is a pseudo device with nothing to look
 * for, so the probe unconditionally reports success.
 */
/*ARGSUSED*/
static int
kcpc_probe(dev_info_t *devi)
{
	return (DDI_PROBE_SUCCESS);
}
    850 
/* Device node recorded by kcpc_attach() and handed out by kcpc_getinfo(). */
static dev_info_t *kcpc_devi;
    852 
    853 static int
    854 kcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
    855 {
    856 	if (cmd != DDI_ATTACH)
    857 		return (DDI_FAILURE);
    858 	kcpc_devi = devi;
    859 	return (ddi_create_minor_node(devi, "shared", S_IFCHR,
    860 	    KCPC_MINOR_SHARED, DDI_PSEUDO, 0));
    861 }
    862 
    863 /*ARGSUSED*/
    864 static int
    865 kcpc_getinfo(dev_info_t *devi, ddi_info_cmd_t cmd, void *arg, void **result)
    866 {
    867 	switch (cmd) {
    868 	case DDI_INFO_DEVT2DEVINFO:
    869 		switch (getminor((dev_t)arg)) {
    870 		case KCPC_MINOR_SHARED:
    871 			*result = kcpc_devi;
    872 			return (DDI_SUCCESS);
    873 		default:
    874 			break;
    875 		}
    876 		break;
    877 	case DDI_INFO_DEVT2INSTANCE:
    878 		*result = 0;
    879 		return (DDI_SUCCESS);
    880 	default:
    881 		break;
    882 	}
    883 
    884 	return (DDI_FAILURE);
    885 }
    886 
/*
 * Driver operations vector.  Detach and reset are not supported (nodev),
 * and no quiesce work is needed for this driver.
 */
static struct dev_ops dev_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* reference count */
	kcpc_getinfo,		/* getinfo */
	nulldev,		/* identify */
	kcpc_probe,		/* probe */
	kcpc_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&cb_ops,		/* character device entry points */
	(struct bus_ops *)0,	/* bus_ops: none */
	NULL,			/* power; NOTE(review): confirm slot meaning */
	ddi_quiesce_not_needed,		/* quiesce */
};
    901 
/*
 * Module linkage for the device driver half of this module.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* this linkage describes a driver */
	"cpc sampling driver",	/* description string */
	&dev_ops		/* driver operations */
};
    907 
/*
 * System call table entry for cpc().  It is shared by the native and,
 * where built, the 32-bit system call linkage below.
 */
static struct sysent cpc_sysent = {
	5,					/* cpc() takes five arguments */
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,	/* system call flags */
	cpc					/* handler */
};
    913 
/*
 * Module linkage for the system call half of this module.
 */
static struct modlsys modlsys = {
	&mod_syscallops,		/* this linkage describes a syscall */
	"cpc sampling system call",	/* description string */
	&cpc_sysent			/* system call table entry */
};
    919 
#ifdef _SYSCALL32_IMPL
/*
 * Module linkage for the 32-bit system call, only built when
 * _SYSCALL32_IMPL is defined.  It reuses the same sysent as the native call.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit cpc sampling system call",
	&cpc_sysent
};
#endif
    927 
/*
 * Top-level module linkage: the driver, the system call and, when built,
 * the 32-bit system call are all installed and removed together through
 * this one structure (see _init() and _fini()).
 */
static struct modlinkage modl = {
	MODREV_1,
	&modldrv,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
};
    936 
    937 static void
    938 kcpc_init(void)
    939 {
    940 	long hash;
    941 
    942 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
    943 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
    944 		mutex_init(&kcpc_ctx_llock[hash],
    945 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
    946 }
    947 
    948 static void
    949 kcpc_fini(void)
    950 {
    951 	long hash;
    952 
    953 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
    954 		mutex_destroy(&kcpc_ctx_llock[hash]);
    955 	rw_destroy(&kcpc_cpuctx_lock);
    956 }
    957 
    958 int
    959 _init(void)
    960 {
    961 	int ret;
    962 
    963 	if (kcpc_hw_load_pcbe() != 0)
    964 		return (ENOTSUP);
    965 
    966 	kcpc_init();
    967 	if ((ret = mod_install(&modl)) != 0)
    968 		kcpc_fini();
    969 	return (ret);
    970 }
    971 
    972 int
    973 _fini(void)
    974 {
    975 	int ret;
    976 
    977 	if ((ret = mod_remove(&modl)) == 0)
    978 		kcpc_fini();
    979 	return (ret);
    980 }
    981 
/*
 * Module information entry point: reports on this module's linkage by
 * passing it, together with the caller's modinfo, to mod_info().
 */
int
_info(struct modinfo *mi)
{
	return (mod_info(&modl, mi));
}
    987