Home | History | Annotate | Download | only in dtrace
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 
     27 #include <sys/errno.h>
     28 #include <sys/stat.h>
     29 #include <sys/modctl.h>
     30 #include <sys/conf.h>
     31 #include <sys/systm.h>
     32 #include <sys/ddi.h>
     33 #include <sys/sunddi.h>
     34 #include <sys/cpuvar.h>
     35 #include <sys/kmem.h>
     36 #include <sys/strsubr.h>
     37 #include <sys/dtrace.h>
     38 #include <sys/cyclic.h>
     39 #include <sys/atomic.h>
     40 
/* Device node saved by profile_attach(); handed back by profile_info(). */
static dev_info_t *profile_devi;
/* Provider identifier filled in by dtrace_register() in profile_attach(). */
static dtrace_provider_id_t profile_id;
     43 
     44 /*
     45  * Regardless of platform, the stack frames look like this in the case of the
     46  * profile provider:
     47  *
     48  *	profile_fire
     49  *	cyclic_expire
     50  *	cyclic_fire
     51  *	[ cbe ]
     52  *	[ interrupt code ]
     53  *
     54  * On x86, there are five frames from the generic interrupt code; further, the
     55  * interrupted instruction appears as its own stack frame, giving us a total of
     56  * 10.
     57  *
     58  * On SPARC, the picture is further complicated because the compiler
     59  * optimizes away tail-calls -- so the following frames are optimized away:
     60  *
     61  * 	profile_fire
     62  *	cyclic_expire
     63  *
     64  * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
     65  * frame cannot be tail-call eliminated, yielding four frames in this case.
     66  *
     67  * All of the above constraints lead to the mess below.  Yes, the profile
     68  * provider should ideally figure this out on-the-fly by hitting one of its own
     69  * probes and then walking its own stack trace.  This is complicated, however,
     70  * and the static definition doesn't seem to be overly brittle.  Still, we
     71  * allow for a manual override in case we get it completely wrong.
     72  */
/*
 * Number of artificial stack frames passed to dtrace_probe_create() (after
 * adding dtrace_mach_aframes()).  Note that no value is defined when neither
 * __x86 nor __sparc is defined.
 */
#ifdef __x86
#define	PROF_ARTIFICIAL_FRAMES	10
#else
#ifdef __sparc
#ifdef DEBUG
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif	/* DEBUG */
#endif	/* __sparc */
#endif	/* __x86 */
     84 
/* Size, in bytes, of the buffers that hold a probe name in this file. */
#define	PROF_NAMELEN		15

/* Probe kinds (profile_probe_t.prof_kind) and their probe-name prefixes. */
#define	PROF_PROFILE		0
#define	PROF_TICK		1
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
     91 
/*
 * State for one profile provider probe; allocated in profile_create() and
 * freed in profile_destroy().
 */
typedef struct profile_probe {
	char		prof_name[PROF_NAMELEN];	/* copy of probe name */
	dtrace_id_t	prof_id;	/* id from dtrace_probe_create() */
	int		prof_kind;	/* PROF_PROFILE or PROF_TICK */
	hrtime_t	prof_interval;	/* firing interval, in nanoseconds */
	cyclic_id_t	prof_cyclic;	/* CYCLIC_NONE while not enabled */
} profile_probe_t;
     99 
/*
 * State for one omnipresent-cyclic instance of a PROF_PROFILE probe;
 * allocated in profile_online() and freed in profile_offline().
 */
typedef struct profile_probe_percpu {
	hrtime_t	profc_expected;	/* time the next firing is expected */
	hrtime_t	profc_interval;	/* copy of the cyclic's interval */
	profile_probe_t	*profc_probe;	/* probe this instance belongs to */
} profile_probe_percpu_t;
    105 
/*
 * profile_interval_min is the shortest interval profile_create() accepts;
 * a non-zero profile_aframes replaces the computed artificial frame count.
 */
hrtime_t	profile_interval_min = NANOSEC / 5000;		/* 5000 hz */
int		profile_aframes = 0;				/* override */
    108 
/*
 * Rates (firings per second) of the profile-<rate> probes that
 * profile_provide() creates when called without a probe description.
 * Zero entries are skipped.
 * NOTE(review): the zero slots presumably leave room for patching in
 * additional rates -- confirm.
 */
static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};

/*
 * Rates (firings per second) of the tick-<rate> probes that
 * profile_provide() creates when called without a probe description.
 * Zero entries are skipped.
 */
static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
    121 
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
static uint32_t profile_max;		/* maximum number of profile probes */
static uint32_t profile_total;	/* current number of profile probes */
    132 
    133 static void
    134 profile_fire(void *arg)
    135 {
    136 	profile_probe_percpu_t *pcpu = arg;
    137 	profile_probe_t *prof = pcpu->profc_probe;
    138 	hrtime_t late;
    139 
    140 	late = dtrace_gethrtime() - pcpu->profc_expected;
    141 	pcpu->profc_expected += pcpu->profc_interval;
    142 
    143 	dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
    144 	    CPU->cpu_profile_upc, late, 0, 0);
    145 }
    146 
    147 static void
    148 profile_tick(void *arg)
    149 {
    150 	profile_probe_t *prof = arg;
    151 
    152 	dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
    153 	    CPU->cpu_profile_upc, 0, 0, 0);
    154 }
    155 
    156 static void
    157 profile_create(hrtime_t interval, const char *name, int kind)
    158 {
    159 	profile_probe_t *prof;
    160 	int nr_frames = PROF_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
    161 
    162 	if (profile_aframes)
    163 		nr_frames = profile_aframes;
    164 
    165 	if (interval < profile_interval_min)
    166 		return;
    167 
    168 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
    169 		return;
    170 
    171 	atomic_add_32(&profile_total, 1);
    172 	if (profile_total > profile_max) {
    173 		atomic_add_32(&profile_total, -1);
    174 		return;
    175 	}
    176 
    177 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
    178 	(void) strcpy(prof->prof_name, name);
    179 	prof->prof_interval = interval;
    180 	prof->prof_cyclic = CYCLIC_NONE;
    181 	prof->prof_kind = kind;
    182 	prof->prof_id = dtrace_probe_create(profile_id,
    183 	    NULL, NULL, name, nr_frames, prof);
    184 }
    185 
    186 /*ARGSUSED*/
    187 static void
    188 profile_provide(void *arg, const dtrace_probedesc_t *desc)
    189 {
    190 	int i, j, rate, kind;
    191 	hrtime_t val = 0, mult = 1, len;
    192 	const char *name, *suffix = NULL;
    193 
    194 	const struct {
    195 		char *prefix;
    196 		int kind;
    197 	} types[] = {
    198 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
    199 		{ PROF_PREFIX_TICK, PROF_TICK },
    200 		{ NULL, NULL }
    201 	};
    202 
    203 	const struct {
    204 		char *name;
    205 		hrtime_t mult;
    206 	} suffixes[] = {
    207 		{ "ns", 	NANOSEC / NANOSEC },
    208 		{ "nsec",	NANOSEC / NANOSEC },
    209 		{ "us",		NANOSEC / MICROSEC },
    210 		{ "usec",	NANOSEC / MICROSEC },
    211 		{ "ms",		NANOSEC / MILLISEC },
    212 		{ "msec",	NANOSEC / MILLISEC },
    213 		{ "s",		NANOSEC / SEC },
    214 		{ "sec",	NANOSEC / SEC },
    215 		{ "m",		NANOSEC * (hrtime_t)60 },
    216 		{ "min",	NANOSEC * (hrtime_t)60 },
    217 		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
    218 		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
    219 		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
    220 		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
    221 		{ "hz",		0 },
    222 		{ NULL }
    223 	};
    224 
    225 	if (desc == NULL) {
    226 		char n[PROF_NAMELEN];
    227 
    228 		/*
    229 		 * If no description was provided, provide all of our probes.
    230 		 */
    231 		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
    232 			if ((rate = profile_rates[i]) == 0)
    233 				continue;
    234 
    235 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
    236 			    PROF_PREFIX_PROFILE, rate);
    237 			profile_create(NANOSEC / rate, n, PROF_PROFILE);
    238 		}
    239 
    240 		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
    241 			if ((rate = profile_ticks[i]) == 0)
    242 				continue;
    243 
    244 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
    245 			    PROF_PREFIX_TICK, rate);
    246 			profile_create(NANOSEC / rate, n, PROF_TICK);
    247 		}
    248 
    249 		return;
    250 	}
    251 
    252 	name = desc->dtpd_name;
    253 
    254 	for (i = 0; types[i].prefix != NULL; i++) {
    255 		len = strlen(types[i].prefix);
    256 
    257 		if (strncmp(name, types[i].prefix, len) != 0)
    258 			continue;
    259 		break;
    260 	}
    261 
    262 	if (types[i].prefix == NULL)
    263 		return;
    264 
    265 	kind = types[i].kind;
    266 	j = strlen(name) - len;
    267 
    268 	/*
    269 	 * We need to start before any time suffix.
    270 	 */
    271 	for (j = strlen(name); j >= len; j--) {
    272 		if (name[j] >= '0' && name[j] <= '9')
    273 			break;
    274 		suffix = &name[j];
    275 	}
    276 
    277 	ASSERT(suffix != NULL);
    278 
    279 	/*
    280 	 * Now determine the numerical value present in the probe name.
    281 	 */
    282 	for (; j >= len; j--) {
    283 		if (name[j] < '0' || name[j] > '9')
    284 			return;
    285 
    286 		val += (name[j] - '0') * mult;
    287 		mult *= (hrtime_t)10;
    288 	}
    289 
    290 	if (val == 0)
    291 		return;
    292 
    293 	/*
    294 	 * Look-up the suffix to determine the multiplier.
    295 	 */
    296 	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
    297 		if (strcasecmp(suffixes[i].name, suffix) == 0) {
    298 			mult = suffixes[i].mult;
    299 			break;
    300 		}
    301 	}
    302 
    303 	if (suffixes[i].name == NULL && *suffix != '\0')
    304 		return;
    305 
    306 	if (mult == 0) {
    307 		/*
    308 		 * The default is frequency-per-second.
    309 		 */
    310 		val = NANOSEC / val;
    311 	} else {
    312 		val *= mult;
    313 	}
    314 
    315 	profile_create(val, name, kind);
    316 }
    317 
    318 /*ARGSUSED*/
    319 static void
    320 profile_destroy(void *arg, dtrace_id_t id, void *parg)
    321 {
    322 	profile_probe_t *prof = parg;
    323 
    324 	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
    325 	kmem_free(prof, sizeof (profile_probe_t));
    326 
    327 	ASSERT(profile_total >= 1);
    328 	atomic_add_32(&profile_total, -1);
    329 }
    330 
    331 /*ARGSUSED*/
    332 static void
    333 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
    334 {
    335 	profile_probe_t *prof = arg;
    336 	profile_probe_percpu_t *pcpu;
    337 
    338 	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
    339 	pcpu->profc_probe = prof;
    340 
    341 	hdlr->cyh_func = profile_fire;
    342 	hdlr->cyh_arg = pcpu;
    343 	hdlr->cyh_level = CY_HIGH_LEVEL;
    344 
    345 	when->cyt_interval = prof->prof_interval;
    346 	when->cyt_when = dtrace_gethrtime() + when->cyt_interval;
    347 
    348 	pcpu->profc_expected = when->cyt_when;
    349 	pcpu->profc_interval = when->cyt_interval;
    350 }
    351 
    352 /*ARGSUSED*/
    353 static void
    354 profile_offline(void *arg, cpu_t *cpu, void *oarg)
    355 {
    356 	profile_probe_percpu_t *pcpu = oarg;
    357 
    358 	ASSERT(pcpu->profc_probe == arg);
    359 	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
    360 }
    361 
    362 /*ARGSUSED*/
    363 static void
    364 profile_enable(void *arg, dtrace_id_t id, void *parg)
    365 {
    366 	profile_probe_t *prof = parg;
    367 	cyc_omni_handler_t omni;
    368 	cyc_handler_t hdlr;
    369 	cyc_time_t when;
    370 
    371 	ASSERT(prof->prof_interval != 0);
    372 	ASSERT(MUTEX_HELD(&cpu_lock));
    373 
    374 	if (prof->prof_kind == PROF_TICK) {
    375 		hdlr.cyh_func = profile_tick;
    376 		hdlr.cyh_arg = prof;
    377 		hdlr.cyh_level = CY_HIGH_LEVEL;
    378 
    379 		when.cyt_interval = prof->prof_interval;
    380 		when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
    381 	} else {
    382 		ASSERT(prof->prof_kind == PROF_PROFILE);
    383 		omni.cyo_online = profile_online;
    384 		omni.cyo_offline = profile_offline;
    385 		omni.cyo_arg = prof;
    386 	}
    387 
    388 	if (prof->prof_kind == PROF_TICK) {
    389 		prof->prof_cyclic = cyclic_add(&hdlr, &when);
    390 	} else {
    391 		prof->prof_cyclic = cyclic_add_omni(&omni);
    392 	}
    393 }
    394 
    395 /*ARGSUSED*/
    396 static void
    397 profile_disable(void *arg, dtrace_id_t id, void *parg)
    398 {
    399 	profile_probe_t *prof = parg;
    400 
    401 	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
    402 	ASSERT(MUTEX_HELD(&cpu_lock));
    403 
    404 	cyclic_remove(prof->prof_cyclic);
    405 	prof->prof_cyclic = CYCLIC_NONE;
    406 }
    407 
    408 /*ARGSUSED*/
    409 static int
    410 profile_usermode(void *arg, dtrace_id_t id, void *parg)
    411 {
    412 	return (CPU->cpu_profile_pc == 0);
    413 }
    414 
/*
 * Stability attributes passed to dtrace_register() in profile_attach().
 * NOTE(review): the five rows presumably correspond, in order, to the
 * provider, module, function, name and argument attributes of
 * dtrace_pattr_t -- confirm against <sys/dtrace.h>.
 */
static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
};
    422 
/*
 * Provider operations vector passed to dtrace_register() in
 * profile_attach().  Slots are positional; NULL marks an operation this
 * provider does not supply.
 * NOTE(review): slot meanings are defined by dtrace_pops_t in
 * <sys/dtrace.h>, which is not visible here -- confirm the ordering there
 * before adding or moving entries.
 */
static dtrace_pops_t profile_pops = {
	profile_provide,
	NULL,
	profile_enable,
	profile_disable,
	NULL,
	NULL,
	NULL,
	NULL,
	profile_usermode,
	profile_destroy
};
    435 
    436 static int
    437 profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
    438 {
    439 	switch (cmd) {
    440 	case DDI_ATTACH:
    441 		break;
    442 	case DDI_RESUME:
    443 		return (DDI_SUCCESS);
    444 	default:
    445 		return (DDI_FAILURE);
    446 	}
    447 
    448 	if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
    449 	    DDI_PSEUDO, NULL) == DDI_FAILURE ||
    450 	    dtrace_register("profile", &profile_attr,
    451 	    DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL,
    452 	    &profile_pops, NULL, &profile_id) != 0) {
    453 		ddi_remove_minor_node(devi, NULL);
    454 		return (DDI_FAILURE);
    455 	}
    456 
    457 	profile_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
    458 	    "profile-max-probes", PROFILE_MAX_DEFAULT);
    459 
    460 	ddi_report_dev(devi);
    461 	profile_devi = devi;
    462 	return (DDI_SUCCESS);
    463 }
    464 
    465 static int
    466 profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
    467 {
    468 	switch (cmd) {
    469 	case DDI_DETACH:
    470 		break;
    471 	case DDI_SUSPEND:
    472 		return (DDI_SUCCESS);
    473 	default:
    474 		return (DDI_FAILURE);
    475 	}
    476 
    477 	if (dtrace_unregister(profile_id) != 0)
    478 		return (DDI_FAILURE);
    479 
    480 	ddi_remove_minor_node(devi, NULL);
    481 	return (DDI_SUCCESS);
    482 }
    483 
    484 /*ARGSUSED*/
    485 static int
    486 profile_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
    487 {
    488 	int error;
    489 
    490 	switch (infocmd) {
    491 	case DDI_INFO_DEVT2DEVINFO:
    492 		*result = (void *)profile_devi;
    493 		error = DDI_SUCCESS;
    494 		break;
    495 	case DDI_INFO_DEVT2INSTANCE:
    496 		*result = (void *)0;
    497 		error = DDI_SUCCESS;
    498 		break;
    499 	default:
    500 		error = DDI_FAILURE;
    501 	}
    502 	return (error);
    503 }
    504 
    505 /*ARGSUSED*/
    506 static int
    507 profile_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
    508 {
    509 	return (0);
    510 }
    511 
/*
 * Character/block entry points for the pseudo device.  Only open is
 * implemented by this driver (profile_open); the remaining slots are
 * filled with the generic nodev/nulldev/nochpoll/ddi_prop_op routines.
 */
static struct cb_ops profile_cb_ops = {
	profile_open,		/* open */
	nodev,			/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP		/* Driver compatibility flag */
};
    529 
/*
 * Device operations for the pseudo device: wires up profile_info,
 * profile_attach and profile_detach, and points at profile_cb_ops for the
 * character/block entry points.
 */
static struct dev_ops profile_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	profile_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	profile_attach,		/* attach */
	profile_detach,		/* detach */
	nodev,			/* reset */
	&profile_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,		/* quiesce */
};
    544 
/*
 * Module linkage information for the kernel.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Profile Interrupt Tracing",	/* name of module */
	&profile_ops,		/* driver ops */
};

/*
 * Linkage handed to mod_install(), mod_info() and mod_remove() by _init(),
 * _info() and _fini(); it lists the single driver linkage above.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
    559 
    560 int
    561 _init(void)
    562 {
    563 	return (mod_install(&modlinkage));
    564 }
    565 
    566 int
    567 _info(struct modinfo *modinfop)
    568 {
    569 	return (mod_info(&modlinkage, modinfop));
    570 }
    571 
    572 int
    573 _fini(void)
    574 {
    575 	return (mod_remove(&modlinkage));
    576 }
    577