Home | History | Annotate | Download | only in dtrace
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * DTrace - Dynamic Tracing for Solaris
     29  *
     30  * This is the implementation of the Solaris Dynamic Tracing framework
     31  * (DTrace).  The user-visible interface to DTrace is described at length in
     32  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
     33  * library, the in-kernel DTrace framework, and the DTrace providers are
     34  * described in the block comments in the <sys/dtrace.h> header file.  The
     35  * internal architecture of DTrace is described in the block comments in the
     36  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
     37  * implementation very much assume mastery of all of these sources; if one has
     38  * an unanswered question about the implementation, one should consult them
     39  * first.
     40  *
     41  * The functions here are ordered roughly as follows:
     42  *
     43  *   - Probe context functions
     44  *   - Probe hashing functions
     45  *   - Non-probe context utility functions
     46  *   - Matching functions
     47  *   - Provider-to-Framework API functions
     48  *   - Probe management functions
     49  *   - DIF object functions
     50  *   - Format functions
     51  *   - Predicate functions
     52  *   - ECB functions
     53  *   - Buffer functions
     54  *   - Enabling functions
     55  *   - DOF functions
     56  *   - Anonymous enabling functions
     57  *   - Consumer state functions
     58  *   - Helper functions
     59  *   - Hook functions
     60  *   - Driver cookbook functions
     61  *
     62  * Each group of functions begins with a block comment labelled the "DTrace
     63  * [Group] Functions", allowing one to find each block by searching forward
     64  * on capital-f functions.
     65  */
     66 #include <sys/errno.h>
     67 #include <sys/stat.h>
     68 #include <sys/modctl.h>
     69 #include <sys/conf.h>
     70 #include <sys/systm.h>
     71 #include <sys/ddi.h>
     72 #include <sys/sunddi.h>
     73 #include <sys/cpuvar.h>
     74 #include <sys/kmem.h>
     75 #include <sys/strsubr.h>
     76 #include <sys/sysmacros.h>
     77 #include <sys/dtrace_impl.h>
     78 #include <sys/atomic.h>
     79 #include <sys/cmn_err.h>
     80 #include <sys/mutex_impl.h>
     81 #include <sys/rwlock_impl.h>
     82 #include <sys/ctf_api.h>
     83 #include <sys/panic.h>
     84 #include <sys/priv_impl.h>
     85 #include <sys/policy.h>
     86 #include <sys/cred_impl.h>
     87 #include <sys/procfs_isa.h>
     88 #include <sys/taskq.h>
     89 #include <sys/mkdev.h>
     90 #include <sys/kdi.h>
     91 #include <sys/zone.h>
     92 #include <sys/socket.h>
     93 #include <netinet/in.h>
     94 
     95 /*
     96  * DTrace Tunable Variables
     97  *
     98  * The following variables may be tuned by adding a line to /etc/system that
     99  * includes both the name of the DTrace module ("dtrace") and the name of the
    100  * variable.  For example:
    101  *
    102  *   set dtrace:dtrace_destructive_disallow = 1
    103  *
    104  * In general, the only variables that one should be tuning this way are those
    105  * that affect system-wide DTrace behavior, and for which the default behavior
    106  * is undesirable.  Most of these variables are tunable on a per-consumer
    107  * basis using DTrace options, and need not be tuned on a system-wide basis.
    108  * When tuning these variables, avoid pathological values; while some attempt
    109  * is made to verify the integrity of these variables, they are not considered
    110  * part of the supported interface to DTrace, and they are therefore not
    111  * checked comprehensively.  Further, these variables should not be tuned
    112  * dynamically via "mdb -kw" or other means; they should only be tuned via
    113  * /etc/system.
    114  */
/*
 * Limits and defaults.  Variables declared without an initializer (such as
 * dtrace_err_verbose) start at zero, as all file-scope objects do in C.
 */
    115 int		dtrace_destructive_disallow = 0;
    116 dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
    117 size_t		dtrace_difo_maxsize = (256 * 1024);
    118 dtrace_optval_t	dtrace_dof_maxsize = (256 * 1024);
    119 size_t		dtrace_global_maxsize = (16 * 1024);
    120 size_t		dtrace_actions_max = (16 * 1024);
    121 size_t		dtrace_retain_max = 1024;
    122 dtrace_optval_t	dtrace_helper_actions_max = 32;
    123 dtrace_optval_t	dtrace_helper_providers_max = 32;
    124 dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
    125 size_t		dtrace_strsize_default = 256;
/*
 * The rate, interval and timeout tunables below are written as multiples or
 * fractions of NANOSEC; the trailing comments give the equivalent frequency
 * or duration.  NOTE(review): this assumes NANOSEC is the number of
 * nanoseconds per second -- it is defined outside this file; confirm.
 */
    126 dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
    127 dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
    128 dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
    129 dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
    130 dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
    131 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
    132 dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
    133 dtrace_optval_t	dtrace_nspec_default = 1;
    134 dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
    135 dtrace_optval_t dtrace_stackframes_default = 20;
    136 dtrace_optval_t dtrace_ustackframes_default = 20;
    137 dtrace_optval_t dtrace_jstackframes_default = 50;
    138 dtrace_optval_t dtrace_jstackstrsize_default = 512;
    139 int		dtrace_msgdsize_max = 128;
    140 hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
    141 hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
    142 int		dtrace_devdepth_max = 32;
    143 int		dtrace_err_verbose;		/* no initializer: starts at 0 */
    144 hrtime_t	dtrace_deadman_interval = NANOSEC;	/* 1 * NANOSEC */
    145 hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;	/* 10 * NANOSEC */
    146 hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;	/* 30 * NANOSEC */
    147 
    148 /*
    149  * DTrace External Variables
    150  *
    151  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
    152  * available to DTrace consumers via the backtick (`) syntax.  One of these,
    153  * dtrace_zero, is made deliberately so:  it is provided as a source of
    154  * well-known, zero-filled memory.  While this variable is not documented,
    155  * it is used by some translators as an implementation detail.
    156  */
/* 256 bytes, declared const and explicitly zero-initialized. */
    157 const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
    158 
    159 /*
    160  * DTrace Internal Variables
    161  */
/*
 * Every variable in this group is static (private to this file) and has no
 * initializer, so each starts out zero/NULL.
 */
    162 static dev_info_t	*dtrace_devi;		/* device info */
    163 static vmem_t		*dtrace_arena;		/* probe ID arena */
    164 static vmem_t		*dtrace_minor;		/* minor number arena */
    165 static taskq_t		*dtrace_taskq;		/* task queue */
    166 static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
    167 static int		dtrace_nprobes;		/* number of probes */
    168 static dtrace_provider_t *dtrace_provider;	/* provider list */
    169 static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
    170 static int		dtrace_opens;		/* number of opens */
    171 static int		dtrace_helpers;		/* number of helpers */
    172 static void		*dtrace_softstate;	/* softstate pointer */
    173 static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
    174 static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
    175 static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
    176 static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
    177 static int		dtrace_toxranges;	/* number of toxic ranges */
    178 static int		dtrace_toxranges_max;	/* size of toxic range array */
    179 static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
    180 static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
    181 static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
    182 static kthread_t	*dtrace_panicked;	/* panicking thread */
    183 static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
    184 static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
    185 static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
    186 static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
    187 static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
    188 static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
    189 
    190 /*
    191  * DTrace Locking
    192  * DTrace is protected by three (relatively coarse-grained) locks:
    193  *
    194  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
    195  *     including enabling state, probes, ECBs, consumer state, helper state,
    196  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
    197  *     probe context is lock-free -- synchronization is handled via the
    198  *     dtrace_sync() cross call mechanism.
    199  *
    200  * (2) dtrace_provider_lock is required when manipulating provider state, or
    201  *     when provider state must be held constant.
    202  *
    203  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
    204  *     when meta provider state must be held constant.
    205  *
    206  * The lock ordering between these three locks is dtrace_meta_lock before
    207  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
    208  * several places where dtrace_provider_lock is held by the framework as it
    209  * calls into the providers -- which then call back into the framework,
    210  * grabbing dtrace_lock.)
    211  *
    212  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
    213  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
    214  * role as a coarse-grained lock; it is acquired before both of these locks.
    215  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
    216  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
    217  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
    218  * acquired _between_ dtrace_provider_lock and dtrace_lock.
    219  */
/*
 * Declared in reverse acquisition order: dtrace_meta_lock is taken before
 * dtrace_provider_lock, which is taken before dtrace_lock.
 */
    220 static kmutex_t		dtrace_lock;		/* probe state lock */
    221 static kmutex_t		dtrace_provider_lock;	/* provider state lock */
    222 static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
    223 
    224 /*
    225  * DTrace Provider Variables
    226  *
    227  * These are the variables relating to DTrace as a provider (that is, the
    228  * provider of the BEGIN, END, and ERROR probes).
    229  */
/*
 * Five rows, each a (stability, stability, class) triple.
 * NOTE(review): the rows presumably correspond, in order, to the provider,
 * module, function, name and argument attributes of dtrace_pattr_t, and the
 * two stability columns to name and data stability -- confirm against the
 * dtrace_pattr_t declaration in <sys/dtrace.h>.
 */
    230 static dtrace_pattr_t	dtrace_provider_attr = {
    231 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
    232 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
    233 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
    234 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
    235 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
    236 };
    237 
    238 static void
    239 dtrace_nullop(void)
    240 {}
    241 
/*
 * Operations vector for DTrace acting as its own provider.  Seven slots are
 * filled with dtrace_nullop, cast to the function-pointer type each slot
 * expects; the remaining three are NULL.
 *
 * NOTE(review): the slot names are not visible in this file.  Presumably, in
 * order: provide, provide_module, enable, disable, suspend, resume,
 * getargdesc, getargval, usermode, destroy -- confirm against dtrace_pops_t
 * in <sys/dtrace.h>.  Also note that dtrace_nullop is declared with a (void)
 * parameter list but is invoked through pointers that pass arguments.
 */
    242 static dtrace_pops_t	dtrace_provider_ops = {
    243 	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
    244 	(void (*)(void *, struct modctl *))dtrace_nullop,
    245 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    246 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    247 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    248 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    249 	NULL,
    250 	NULL,
    251 	NULL,
    252 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
    253 };
    254 
/*
 * Identifiers of the three special probes.  Only dtrace_probeid_error has
 * external linkage; the BEGIN and END identifiers are private to this file.
 */
    255 static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
    256 static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
    257 dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */
    258 
    259 /*
    260  * DTrace Helper Tracing Variables
    261  */
/*
 * None of these are static, so they are visible outside this file.  The
 * default buffer size is 512 KB.  Helper tracing defaults to on only when
 * the file is built with DEBUG defined.
 */
    262 uint32_t dtrace_helptrace_next = 0;
    263 uint32_t dtrace_helptrace_nlocals;
    264 char	*dtrace_helptrace_buffer;
    265 int	dtrace_helptrace_bufsize = 512 * 1024;
    266 
    267 #ifdef DEBUG
    268 int	dtrace_helptrace_enabled = 1;
    269 #else
    270 int	dtrace_helptrace_enabled = 0;
    271 #endif
    272 
    273 /*
    274  * DTrace Error Hashing
    275  *
    276  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
    277  * table.  This is very useful for checking coverage of tests that are
    278  * expected to induce DIF or DOF processing errors, and may be useful for
    279  * debugging problems in the DIF code generator or in DOF generation.  The
    280  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
    281  */
/* These four objects exist only when the file is built with DEBUG defined. */
    282 #ifdef DEBUG
    283 static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
    284 static const char *dtrace_errlast;
    285 static kthread_t *dtrace_errthread;
    286 static kmutex_t dtrace_errlock;
    287 #endif
    288 
    289 /*
    290  * DTrace Macros and Constants
    291  *
    292  * These are various macros that are useful in various spots in the
    293  * implementation, along with a few random constants that have no meaning
    294  * outside of the implementation.  There is no real structure to this cpp
    295  * mishmash -- but is there ever?
    296  */
    297 #define	DTRACE_HASHSTR(hash, probe)	\
    298 	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
    299 
    300 #define	DTRACE_HASHNEXT(hash, probe)	\
    301 	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
    302 
    303 #define	DTRACE_HASHPREV(hash, probe)	\
    304 	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
    305 
    306 #define	DTRACE_HASHEQ(hash, lhs, rhs)	\
    307 	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
    308 	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
    309 
/* NOTE(review): meaning of the slew value is not visible here; see its users. */
    310 #define	DTRACE_AGGHASHSIZE_SLEW		17
    311 
/*
 * Three 32-bit words, i.e. 12 bytes.  NOTE(review): presumably the offset of
 * the embedded IPv4 address inside an IPv4-mapped IPv6 address -- confirm at
 * the point of use.
 */
    312 #define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
    313 
    314 /*
    315  * The key for a thread-local variable consists of the lower 61 bits of the
    316  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
    317  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
    318  * equal to a variable identifier.  This is necessary (but not sufficient) to
    319  * assure that global associative arrays never collide with thread-local
    320  * variables.  To guarantee that they cannot collide, we must also define the
    321  * order for keying dynamic variables.  That order is:
    322  *
    323  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
    324  *
    325  * Because the variable-key and the tls-key are in orthogonal spaces, there is
    326  * no way for a global variable key signature to match a thread-local key
    327  * signature.
    328  */
/*
 * Usage notes: the expansion is a bare brace-enclosed block (not a
 * do { } while (0) statement) that declares locals named intr and actv, so
 * 'where' must be an assignable expression that does not itself mention
 * those names.  intr counts how many right shifts it takes to clear
 * cpu_intr_actv >> (LOCK_LEVEL + 1), i.e. the position of its highest set
 * bit plus one, and is stored in the top three bits of the 64-bit key.
 */
    329 #define	DTRACE_TLS_THRKEY(where) { \
    330 	uint_t intr = 0; \
    331 	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
    332 	for (; actv; actv >>= 1) \
    333 		intr++; \
    334 	ASSERT(intr < (1 << 3)); \
    335 	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
    336 	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
    337 }
    338 
    339 #define	DT_BSWAP_8(x)	((x) & 0xff)
    340 #define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
    341 #define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
    342 #define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
    343 
    344 #define	DT_MASK_LO 0x00000000FFFFFFFFULL
    345 
    346 #define	DTRACE_STORE(type, tomax, offset, what) \
    347 	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
    348 
    349 #ifndef __i386
    350 #define	DTRACE_ALIGNCHECK(addr, size, flags)				\
    351 	if (addr & (size - 1)) {					\
    352 		*flags |= CPU_DTRACE_BADALIGN;				\
    353 		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
    354 		return (0);						\
    355 	}
    356 #else
    357 #define	DTRACE_ALIGNCHECK(addr, size, flags)
    358 #endif
    359 
    360 /*
    361  * Test whether a range of memory starting at testaddr of size testsz falls
    362  * within the range of memory described by baseaddr, basesz.  We take care
    363  * to avoid problems with overflow and underflow of the unsigned quantities,
    364  * and disallow all negative sizes.  Ranges of size 0 are allowed.
 *
 * Note that the first test requires testaddr - baseaddr to be strictly less
 * than basesz, so a range that begins exactly at baseaddr + basesz is
 * rejected even when testsz is 0.  Every argument is evaluated more than
 * once; do not pass expressions with side effects.
    365  */
    366 #define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
    367 	((testaddr) - (baseaddr) < (basesz) && \
    368 	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
    369 	(testaddr) + (testsz) >= (testaddr))
    370 
    371 /*
    372  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
    373  * alloc_sz on the righthand side of the comparison in order to avoid overflow
    374  * or underflow in the comparison with it.  This is simpler than the INRANGE
    375  * check above, because we know that the dtms_scratch_ptr is valid in the
    376  * range.  Allocations of size zero are allowed.
 *
 * The left-hand side is the number of bytes between dtms_scratch_ptr and the
 * end of the scratch region (dtms_scratch_base + dtms_scratch_size).
    377  */
    378 #define	DTRACE_INSCRATCH(mstate, alloc_sz) \
    379 	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
    380 	(mstate)->dtms_scratch_ptr >= (alloc_sz))
    381 
/*
 * DTRACE_LOADFUNC(bits) expands to the definition of dtrace_load<bits>(),
 * which reads a uint<bits>_t from addr and returns it, as follows:
 *
 *   - DTRACE_ALIGNCHECK runs first; where it is not compiled out it may set
 *     CPU_DTRACE_BADALIGN and return 0.
 *
 *   - The access [addr, addr + size) is compared against each entry of
 *     dtrace_toxrange.  An entry is skipped when addr is at or above its
 *     dtt_limit or when addr + size is at or below its dtt_base; otherwise
 *     the access overlaps the entry, CPU_DTRACE_BADADDR is set, addr is
 *     recorded in cpuc_dtrace_illval, and 0 is returned.
 *
 *   - Otherwise the load is issued with CPU_DTRACE_NOFAULT set in the
 *     per-CPU flags and the bit is cleared again afterwards.  If
 *     CPU_DTRACE_FAULT is set in the flags at that point, 0 is returned in
 *     place of the loaded value.
 *
 * NOTE(review): addr + size is not checked for wraparound before it is
 * compared with dtt_base.
 */
    382 #define	DTRACE_LOADFUNC(bits)						\
    383 /*CSTYLED*/								\
    384 uint##bits##_t								\
    385 dtrace_load##bits(uintptr_t addr)					\
    386 {									\
    387 	size_t size = bits / NBBY;					\
    388 	/*CSTYLED*/							\
    389 	uint##bits##_t rval;						\
    390 	int i;								\
    391 	volatile uint16_t *flags = (volatile uint16_t *)		\
    392 	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
    393 									\
    394 	DTRACE_ALIGNCHECK(addr, size, flags);				\
    395 									\
    396 	for (i = 0; i < dtrace_toxranges; i++) {			\
    397 		if (addr >= dtrace_toxrange[i].dtt_limit)		\
    398 			continue;					\
    399 									\
    400 		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
    401 			continue;					\
    402 									\
    403 		/*							\
    404 		 * This address falls within a toxic region; return 0.	\
    405 		 */							\
    406 		*flags |= CPU_DTRACE_BADADDR;				\
    407 		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
    408 		return (0);						\
    409 	}								\
    410 									\
    411 	*flags |= CPU_DTRACE_NOFAULT;					\
    412 	/*CSTYLED*/							\
    413 	rval = *((volatile uint##bits##_t *)addr);			\
    414 	*flags &= ~CPU_DTRACE_NOFAULT;					\
    415 									\
    416 	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
    417 }
    418 
/*
 * Pointer-sized safe load: the 64-bit loader when _LP64 is defined,
 * otherwise the 32-bit loader.
 */
    419 #ifdef _LP64
    420 #define	dtrace_loadptr	dtrace_load64
    421 #else
    422 #define	dtrace_loadptr	dtrace_load32
    423 #endif
    424 
/* NOTE(review): states of a dynamic-variable hash entry, judging by the names. */
    425 #define	DTRACE_DYNHASH_FREE	0
    426 #define	DTRACE_DYNHASH_SINK	1
    427 #define	DTRACE_DYNHASH_VALID	2
    428 
/* NOTE(review): presumably return codes for a per-probe match callback. */
    429 #define	DTRACE_MATCH_NEXT	0
    430 #define	DTRACE_MATCH_DONE	1
/* True when the probe's function name (dtpr_func) is a non-empty string. */
    431 #define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
    432 #define	DTRACE_STATE_ALIGN	64
    433 
/*
 * Map a set of CPU_DTRACE_* flag bits to a single DTRACEFLT_* code.  The
 * tests are made in the order listed and the first bit found set decides
 * the result, so when several bits are set the earliest one in this list
 * wins.  If none of the listed bits is set the result is
 * DTRACEFLT_UNKNOWN.  'flags' is evaluated multiple times.
 */
    434 #define	DTRACE_FLAGS2FLT(flags)						\
    435 	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
    436 	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
    437 	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
    438 	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
    439 	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
    440 	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
    441 	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
    442 	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
    443 	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
    444 	DTRACEFLT_UNKNOWN)
    445 
/*
 * True when the action is a DIF expression (DTRACEACT_DIFEXPR) whose DIF
 * object has a return type of kind DIF_TYPE_STRING.  dta_difo is only
 * dereferenced when the first test succeeds.
 */
    446 #define	DTRACEACT_ISSTRING(act)						\
    447 	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
    448 	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
    449 
/*
 * Forward declarations of file-private functions that are referenced before
 * their definitions (for example, dtrace_strlen() is called by
 * dtrace_strcanload() ahead of its definition).
 */
    450 static size_t dtrace_strlen(const char *, size_t);
    451 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
    452 static void dtrace_enabling_provide(dtrace_provider_t *);
    453 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
    454 static void dtrace_enabling_matchall(void);
    455 static dtrace_state_t *dtrace_anon_grab(void);
    456 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    457     dtrace_state_t *, uint64_t, uint64_t);
    458 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
    459 static void dtrace_buffer_drop(dtrace_buffer_t *);
    460 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    461     dtrace_state_t *, dtrace_mstate_t *);
    462 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    463     dtrace_optval_t);
    464 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
    465 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
    466 
    467 /*
    468  * DTrace Probe Context Functions
    469  *
    470  * These functions are called from probe context.  Because probe context is
    471  * any context in which C may be called, arbitrarily locks may be held,
    472  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
    473  * As a result, functions called from probe context may only call other DTrace
    474  * support functions -- they may not interact at all with the system at large.
    475  * (Note that the ASSERT macro is made probe-context safe by redefining it in
    476  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
    477  * loads are to be performed from probe context, they _must_ be in terms of
    478  * the safe dtrace_load*() variants.
    479  *
    480  * Some functions in this block are not actually called from probe context;
    481  * for these functions, there will be a comment above the function reading
    482  * "Note:  not called from probe context."
    483  */
    484 void
    485 dtrace_panic(const char *format, ...)
    486 {
    487 	va_list alist;
    488 
    489 	va_start(alist, format);
    490 	dtrace_vpanic(format, alist);
    491 	va_end(alist);
    492 }
    493 
    494 int
    495 dtrace_assfail(const char *a, const char *f, int l)
    496 {
    497 	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
    498 
    499 	/*
    500 	 * We just need something here that even the most clever compiler
    501 	 * cannot optimize away.
    502 	 */
    503 	return (a[(uintptr_t)f]);
    504 }
    505 
    506 /*
    507  * Atomically increment a specified error counter from probe context.
    508  */
    509 static void
    510 dtrace_error(uint32_t *counter)
    511 {
    512 	/*
    513 	 * Most counters stored to in probe context are per-CPU counters.
    514 	 * However, there are some error conditions that are sufficiently
    515 	 * arcane that they don't merit per-CPU storage.  If these counters
    516 	 * are incremented concurrently on different CPUs, scalability will be
    517 	 * adversely affected -- but we don't expect them to be white-hot in a
    518 	 * correctly constructed enabling...
    519 	 */
    520 	uint32_t oval, nval;
    521 
    522 	do {
    523 		oval = *counter;
    524 
    525 		if ((nval = oval + 1) == 0) {
    526 			/*
    527 			 * If the counter would wrap, set it to 1 -- assuring
    528 			 * that the counter is never zero when we have seen
    529 			 * errors.  (The counter must be 32-bits because we
    530 			 * aren't guaranteed a 64-bit compare&swap operation.)
    531 			 * To save this code both the infamy of being fingered
    532 			 * by a priggish news story and the indignity of being
    533 			 * the target of a neo-puritan witch trial, we're
    534 			 * carefully avoiding any colorful description of the
    535 			 * likelihood of this condition -- but suffice it to
    536 			 * say that it is only slightly more likely than the
    537 			 * overflow of predicate cache IDs, as discussed in
    538 			 * dtrace_predicate_create().
    539 			 */
    540 			nval = 1;
    541 		}
    542 	} while (dtrace_cas32(counter, oval, nval) != oval);
    543 }
    544 
    545 /*
    546  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
    547  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 *
 * The four expansions define dtrace_load8(), dtrace_load16(),
 * dtrace_load32() and dtrace_load64(); dtrace_loadptr is an alias for the
 * 64-bit or the 32-bit variant depending on _LP64.
    548  */
    549 DTRACE_LOADFUNC(8)
    550 DTRACE_LOADFUNC(16)
    551 DTRACE_LOADFUNC(32)
    552 DTRACE_LOADFUNC(64)
    553 
    554 static int
    555 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
    556 {
    557 	if (dest < mstate->dtms_scratch_base)
    558 		return (0);
    559 
    560 	if (dest + size < dest)
    561 		return (0);
    562 
    563 	if (dest + size > mstate->dtms_scratch_ptr)
    564 		return (0);
    565 
    566 	return (1);
    567 }
    568 
    569 static int
    570 dtrace_canstore_statvar(uint64_t addr, size_t sz,
    571     dtrace_statvar_t **svars, int nsvars)
    572 {
    573 	int i;
    574 
    575 	for (i = 0; i < nsvars; i++) {
    576 		dtrace_statvar_t *svar = svars[i];
    577 
    578 		if (svar == NULL || svar->dtsv_size == 0)
    579 			continue;
    580 
    581 		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
    582 			return (1);
    583 	}
    584 
    585 	return (0);
    586 }
    587 
    588 /*
    589  * Check to see if the address is within a memory region to which a store may
    590  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
    591  * region.  The caller of dtrace_canstore() is responsible for performing any
    592  * alignment checks that are needed before stores are actually executed.
    593  */
    594 static int
    595 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    596     dtrace_vstate_t *vstate)
    597 {
    598 	/*
    599 	 * First, check to see if the address is in scratch space...
    600 	 */
    601 	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
    602 	    mstate->dtms_scratch_size))
    603 		return (1);
    604 
    605 	/*
    606 	 * Now check to see if it's a dynamic variable.  This check will pick
    607 	 * up both thread-local variables and any global dynamically-allocated
    608 	 * variables.
    609 	 */
    610 	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
    611 	    vstate->dtvs_dynvars.dtds_size)) {
    612 		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
    613 		uintptr_t base = (uintptr_t)dstate->dtds_base +
    614 		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
    615 		uintptr_t chunkoffs;
    616 
    617 		/*
    618 		 * Before we assume that we can store here, we need to make
    619 		 * sure that it isn't in our metadata -- storing to our
    620 		 * dynamic variable metadata would corrupt our state.  For
    621 		 * the range to not include any dynamic variable metadata,
    622 		 * it must:
    623 		 *
    624 		 *	(1) Start above the hash table that is at the base of
    625 		 *	the dynamic variable space
    626 		 *
    627 		 *	(2) Have a starting chunk offset that is beyond the
    628 		 *	dtrace_dynvar_t that is at the base of every chunk
    629 		 *
    630 		 *	(3) Not span a chunk boundary
    631 		 *
    632 		 */
    633 		if (addr < base)
    634 			return (0);
    635 
    636 		chunkoffs = (addr - base) % dstate->dtds_chunksize;
    637 
    638 		if (chunkoffs < sizeof (dtrace_dynvar_t))
    639 			return (0);
    640 
    641 		if (chunkoffs + sz > dstate->dtds_chunksize)
    642 			return (0);
    643 
    644 		return (1);
    645 	}
    646 
    647 	/*
    648 	 * Finally, check the static local and global variables.  These checks
    649 	 * take the longest, so we perform them last.
    650 	 */
    651 	if (dtrace_canstore_statvar(addr, sz,
    652 	    vstate->dtvs_locals, vstate->dtvs_nlocals))
    653 		return (1);
    654 
    655 	if (dtrace_canstore_statvar(addr, sz,
    656 	    vstate->dtvs_globals, vstate->dtvs_nglobals))
    657 		return (1);
    658 
    659 	return (0);
    660 }
    661 
    662 
    663 /*
    664  * Convenience routine to check to see if the address is within a memory
    665  * region in which a load may be issued given the user's privilege level;
    666  * if not, it sets the appropriate error flags and loads 'addr' into the
    667  * illegal value slot.
    668  *
    669  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
    670  * appropriate memory access protection.
    671  */
    672 static int
    673 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    674     dtrace_vstate_t *vstate)
    675 {
    676 	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
    677 
    678 	/*
    679 	 * If we hold the privilege to read from kernel memory, then
    680 	 * everything is readable.
    681 	 */
    682 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
    683 		return (1);
    684 
    685 	/*
    686 	 * You can obviously read that which you can store.
    687 	 */
    688 	if (dtrace_canstore(addr, sz, mstate, vstate))
    689 		return (1);
    690 
    691 	/*
    692 	 * We're allowed to read from our own string table.
    693 	 */
    694 	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
    695 	    mstate->dtms_difo->dtdo_strlen))
    696 		return (1);
    697 
    698 	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
    699 	*illval = addr;
    700 	return (0);
    701 }
    702 
    703 /*
    704  * Convenience routine to check to see if a given string is within a memory
    705  * region in which a load may be issued given the user's privilege level;
    706  * this exists so that we don't need to issue unnecessary dtrace_strlen()
    707  * calls in the event that the user has all privileges.
    708  */
    709 static int
    710 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    711     dtrace_vstate_t *vstate)
    712 {
    713 	size_t strsz;
    714 
    715 	/*
    716 	 * If we hold the privilege to read from kernel memory, then
    717 	 * everything is readable.
    718 	 */
    719 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
    720 		return (1);
    721 
    722 	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
    723 	if (dtrace_canload(addr, strsz, mstate, vstate))
    724 		return (1);
    725 
    726 	return (0);
    727 }
    728 
    729 /*
    730  * Convenience routine to check to see if a given variable is within a memory
    731  * region in which a load may be issued given the user's privilege level.
    732  */
    733 static int
    734 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    735     dtrace_vstate_t *vstate)
    736 {
    737 	size_t sz;
    738 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
    739 
    740 	/*
    741 	 * If we hold the privilege to read from kernel memory, then
    742 	 * everything is readable.
    743 	 */
    744 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
    745 		return (1);
    746 
    747 	if (type->dtdt_kind == DIF_TYPE_STRING)
    748 		sz = dtrace_strlen(src,
    749 		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
    750 	else
    751 		sz = type->dtdt_size;
    752 
    753 	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
    754 }
    755 
    756 /*
    757  * Compare two strings using safe loads.
    758  */
    759 static int
    760 dtrace_strncmp(char *s1, char *s2, size_t limit)
    761 {
    762 	uint8_t c1, c2;
    763 	volatile uint16_t *flags;
    764 
    765 	if (s1 == s2 || limit == 0)
    766 		return (0);
    767 
    768 	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
    769 
    770 	do {
    771 		if (s1 == NULL) {
    772 			c1 = '\0';
    773 		} else {
    774 			c1 = dtrace_load8((uintptr_t)s1++);
    775 		}
    776 
    777 		if (s2 == NULL) {
    778 			c2 = '\0';
    779 		} else {
    780 			c2 = dtrace_load8((uintptr_t)s2++);
    781 		}
    782 
    783 		if (c1 != c2)
    784 			return (c1 - c2);
    785 	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
    786 
    787 	return (0);
    788 }
    789 
    790 /*
    791  * Compute strlen(s) for a string using safe memory accesses.  The additional
    792  * len parameter is used to specify a maximum length to ensure completion.
    793  */
    794 static size_t
    795 dtrace_strlen(const char *s, size_t lim)
    796 {
    797 	uint_t len;
    798 
    799 	for (len = 0; len != lim; len++) {
    800 		if (dtrace_load8((uintptr_t)s++) == '\0')
    801 			break;
    802 	}
    803 
    804 	return (len);
    805 }
    806 
    807 /*
    808  * Check if an address falls within a toxic region.
    809  */
    810 static int
    811 dtrace_istoxic(uintptr_t kaddr, size_t size)
    812 {
    813 	uintptr_t taddr, tsize;
    814 	int i;
    815 
    816 	for (i = 0; i < dtrace_toxranges; i++) {
    817 		taddr = dtrace_toxrange[i].dtt_base;
    818 		tsize = dtrace_toxrange[i].dtt_limit - taddr;
    819 
    820 		if (kaddr - taddr < tsize) {
    821 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
    822 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
    823 			return (1);
    824 		}
    825 
    826 		if (taddr - kaddr < size) {
    827 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
    828 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
    829 			return (1);
    830 		}
    831 	}
    832 
    833 	return (0);
    834 }
    835 
    836 /*
    837  * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
    838  * memory specified by the DIF program.  The dst is assumed to be safe memory
    839  * that we can store to directly because it is managed by DTrace.  As with
    840  * standard bcopy, overlapping copies are handled properly.
    841  */
    842 static void
    843 dtrace_bcopy(const void *src, void *dst, size_t len)
    844 {
    845 	if (len != 0) {
    846 		uint8_t *s1 = dst;
    847 		const uint8_t *s2 = src;
    848 
    849 		if (s1 <= s2) {
    850 			do {
    851 				*s1++ = dtrace_load8((uintptr_t)s2++);
    852 			} while (--len != 0);
    853 		} else {
    854 			s2 += len;
    855 			s1 += len;
    856 
    857 			do {
    858 				*--s1 = dtrace_load8((uintptr_t)--s2);
    859 			} while (--len != 0);
    860 		}
    861 	}
    862 }
    863 
    864 /*
    865  * Copy src to dst using safe memory accesses, up to either the specified
    866  * length, or the point that a nul byte is encountered.  The src is assumed to
    867  * be unsafe memory specified by the DIF program.  The dst is assumed to be
    868  * safe memory that we can store to directly because it is managed by DTrace.
    869  * Unlike dtrace_bcopy(), overlapping regions are not handled.
    870  */
    871 static void
    872 dtrace_strcpy(const void *src, void *dst, size_t len)
    873 {
    874 	if (len != 0) {
    875 		uint8_t *s1 = dst, c;
    876 		const uint8_t *s2 = src;
    877 
    878 		do {
    879 			*s1++ = c = dtrace_load8((uintptr_t)s2++);
    880 		} while (--len != 0 && c != '\0');
    881 	}
    882 }
    883 
    884 /*
    885  * Copy src to dst, deriving the size and type from the specified (BYREF)
    886  * variable type.  The src is assumed to be unsafe memory specified by the DIF
    887  * program.  The dst is assumed to be DTrace variable memory that is of the
    888  * specified type; we assume that we can store to directly.
    889  */
    890 static void
    891 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
    892 {
    893 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
    894 
    895 	if (type->dtdt_kind == DIF_TYPE_STRING) {
    896 		dtrace_strcpy(src, dst, type->dtdt_size);
    897 	} else {
    898 		dtrace_bcopy(src, dst, type->dtdt_size);
    899 	}
    900 }
    901 
    902 /*
    903  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
    904  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
    905  * safe memory that we can access directly because it is managed by DTrace.
    906  */
    907 static int
    908 dtrace_bcmp(const void *s1, const void *s2, size_t len)
    909 {
    910 	volatile uint16_t *flags;
    911 
    912 	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
    913 
    914 	if (s1 == s2)
    915 		return (0);
    916 
    917 	if (s1 == NULL || s2 == NULL)
    918 		return (1);
    919 
    920 	if (s1 != s2 && len != 0) {
    921 		const uint8_t *ps1 = s1;
    922 		const uint8_t *ps2 = s2;
    923 
    924 		do {
    925 			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
    926 				return (1);
    927 		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
    928 	}
    929 	return (0);
    930 }
    931 
    932 /*
    933  * Zero the specified region using a simple byte-by-byte loop.  Note that this
    934  * is for safe DTrace-managed memory only.
    935  */
    936 static void
    937 dtrace_bzero(void *dst, size_t len)
    938 {
    939 	uchar_t *cp;
    940 
    941 	for (cp = dst; len != 0; len--)
    942 		*cp++ = 0;
    943 }
    944 
    945 static void
    946 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
    947 {
    948 	uint64_t result[2];
    949 
    950 	result[0] = addend1[0] + addend2[0];
    951 	result[1] = addend1[1] + addend2[1] +
    952 	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
    953 
    954 	sum[0] = result[0];
    955 	sum[1] = result[1];
    956 }
    957 
    958 /*
    959  * Shift the 128-bit value in a by b. If b is positive, shift left.
    960  * If b is negative, shift right.
    961  */
    962 static void
    963 dtrace_shift_128(uint64_t *a, int b)
    964 {
    965 	uint64_t mask;
    966 
    967 	if (b == 0)
    968 		return;
    969 
    970 	if (b < 0) {
    971 		b = -b;
    972 		if (b >= 64) {
    973 			a[0] = a[1] >> (b - 64);
    974 			a[1] = 0;
    975 		} else {
    976 			a[0] >>= b;
    977 			mask = 1LL << (64 - b);
    978 			mask -= 1;
    979 			a[0] |= ((a[1] & mask) << (64 - b));
    980 			a[1] >>= b;
    981 		}
    982 	} else {
    983 		if (b >= 64) {
    984 			a[1] = a[0] << (b - 64);
    985 			a[0] = 0;
    986 		} else {
    987 			a[1] <<= b;
    988 			mask = a[0] >> (64 - b);
    989 			a[1] |= mask;
    990 			a[0] <<= b;
    991 		}
    992 	}
    993 }
    994 
    995 /*
    996  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
    997  * use native multiplication on those, and then re-combine into the
    998  * resulting 128-bit value.
    999  *
   1000  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
   1001  *     hi1 * hi2 << 64 +
   1002  *     hi1 * lo2 << 32 +
   1003  *     hi2 * lo1 << 32 +
   1004  *     lo1 * lo2
   1005  */
   1006 static void
   1007 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
   1008 {
   1009 	uint64_t hi1, hi2, lo1, lo2;
   1010 	uint64_t tmp[2];
   1011 
   1012 	hi1 = factor1 >> 32;
   1013 	hi2 = factor2 >> 32;
   1014 
   1015 	lo1 = factor1 & DT_MASK_LO;
   1016 	lo2 = factor2 & DT_MASK_LO;
   1017 
   1018 	product[0] = lo1 * lo2;
   1019 	product[1] = hi1 * hi2;
   1020 
   1021 	tmp[0] = hi1 * lo2;
   1022 	tmp[1] = 0;
   1023 	dtrace_shift_128(tmp, 32);
   1024 	dtrace_add_128(product, tmp, product);
   1025 
   1026 	tmp[0] = hi2 * lo1;
   1027 	tmp[1] = 0;
   1028 	dtrace_shift_128(tmp, 32);
   1029 	dtrace_add_128(product, tmp, product);
   1030 }
   1031 
   1032 /*
   1033  * This privilege check should be used by actions and subroutines to
   1034  * verify that the user credentials of the process that enabled the
   1035  * invoking ECB match the target credentials
   1036  */
   1037 static int
   1038 dtrace_priv_proc_common_user(dtrace_state_t *state)
   1039 {
   1040 	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
   1041 
   1042 	/*
   1043 	 * We should always have a non-NULL state cred here, since if cred
   1044 	 * is null (anonymous tracing), we fast-path bypass this routine.
   1045 	 */
   1046 	ASSERT(s_cr != NULL);
   1047 
   1048 	if ((cr = CRED()) != NULL &&
   1049 	    s_cr->cr_uid == cr->cr_uid &&
   1050 	    s_cr->cr_uid == cr->cr_ruid &&
   1051 	    s_cr->cr_uid ==