Home | History | Annotate | Download | only in dtrace
      1     0    stevel /*
      2     0    stevel  * CDDL HEADER START
      3     0    stevel  *
      4     0    stevel  * The contents of this file are subject to the terms of the
      5  1677        dp  * Common Development and Distribution License (the "License").
      6  1677        dp  * You may not use this file except in compliance with the License.
      7     0    stevel  *
      8     0    stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9     0    stevel  * or http://www.opensolaris.org/os/licensing.
     10     0    stevel  * See the License for the specific language governing permissions
     11     0    stevel  * and limitations under the License.
     12     0    stevel  *
     13     0    stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14     0    stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15     0    stevel  * If applicable, add the following below this CDDL HEADER, with the
     16     0    stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17     0    stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18     0    stevel  *
     19     0    stevel  * CDDL HEADER END
     20     0    stevel  */
     21  1710       ahl 
     22     0    stevel /*
     23  8803  Jonathan  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24     0    stevel  * Use is subject to license terms.
     25     0    stevel  */
     26     0    stevel 
     27     0    stevel /*
     28     0    stevel  * DTrace - Dynamic Tracing for Solaris
     29     0    stevel  *
     30     0    stevel  * This is the implementation of the Solaris Dynamic Tracing framework
     31     0    stevel  * (DTrace).  The user-visible interface to DTrace is described at length in
     32     0    stevel  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
     33     0    stevel  * library, the in-kernel DTrace framework, and the DTrace providers are
     34     0    stevel  * described in the block comments in the <sys/dtrace.h> header file.  The
     35     0    stevel  * internal architecture of DTrace is described in the block comments in the
     36     0    stevel  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
     37     0    stevel  * implementation very much assume mastery of all of these sources; if one has
     38     0    stevel  * an unanswered question about the implementation, one should consult them
     39     0    stevel  * first.
     40     0    stevel  *
     41     0    stevel  * The functions here are ordered roughly as follows:
     42     0    stevel  *
     43     0    stevel  *   - Probe context functions
     44     0    stevel  *   - Probe hashing functions
     45     0    stevel  *   - Non-probe context utility functions
     46     0    stevel  *   - Matching functions
     47     0    stevel  *   - Provider-to-Framework API functions
     48     0    stevel  *   - Probe management functions
     49     0    stevel  *   - DIF object functions
     50     0    stevel  *   - Format functions
     51     0    stevel  *   - Predicate functions
     52     0    stevel  *   - ECB functions
     53     0    stevel  *   - Buffer functions
     54     0    stevel  *   - Enabling functions
     55     0    stevel  *   - DOF functions
     56     0    stevel  *   - Anonymous enabling functions
     57     0    stevel  *   - Consumer state functions
     58     0    stevel  *   - Helper functions
     59     0    stevel  *   - Hook functions
     60     0    stevel  *   - Driver cookbook functions
     61     0    stevel  *
     62     0    stevel  * Each group of functions begins with a block comment labelled the "DTrace
     63     0    stevel  * [Group] Functions", allowing one to find each block by searching forward
     64     0    stevel  * on capital-f functions.
     65     0    stevel  */
     66     0    stevel #include <sys/errno.h>
     67     0    stevel #include <sys/stat.h>
     68     0    stevel #include <sys/modctl.h>
     69     0    stevel #include <sys/conf.h>
     70     0    stevel #include <sys/systm.h>
     71     0    stevel #include <sys/ddi.h>
     72     0    stevel #include <sys/sunddi.h>
     73     0    stevel #include <sys/cpuvar.h>
     74     0    stevel #include <sys/kmem.h>
     75     0    stevel #include <sys/strsubr.h>
     76     0    stevel #include <sys/sysmacros.h>
     77     0    stevel #include <sys/dtrace_impl.h>
     78     0    stevel #include <sys/atomic.h>
     79     0    stevel #include <sys/cmn_err.h>
     80     0    stevel #include <sys/mutex_impl.h>
     81     0    stevel #include <sys/rwlock_impl.h>
     82     0    stevel #include <sys/ctf_api.h>
     83     0    stevel #include <sys/panic.h>
     84     0    stevel #include <sys/priv_impl.h>
     85     0    stevel #include <sys/policy.h>
     86     0    stevel #include <sys/cred_impl.h>
     87     0    stevel #include <sys/procfs_isa.h>
     88     0    stevel #include <sys/taskq.h>
     89     0    stevel #include <sys/mkdev.h>
     90     0    stevel #include <sys/kdi.h>
     91     0    stevel #include <sys/zone.h>
     92  4291   brendan #include <sys/socket.h>
     93  4291   brendan #include <netinet/in.h>
     94     0    stevel 
     95     0    stevel /*
     96     0    stevel  * DTrace Tunable Variables
     97     0    stevel  *
     98     0    stevel  * The following variables may be tuned by adding a line to /etc/system that
     99     0    stevel  * includes both the name of the DTrace module ("dtrace") and the name of the
    100     0    stevel  * variable.  For example:
    101     0    stevel  *
    102     0    stevel  *   set dtrace:dtrace_destructive_disallow = 1
    103     0    stevel  *
    104     0    stevel  * In general, the only variables that one should be tuning this way are those
    105     0    stevel  * that affect system-wide DTrace behavior, and for which the default behavior
    106     0    stevel  * is undesirable.  Most of these variables are tunable on a per-consumer
    107     0    stevel  * basis using DTrace options, and need not be tuned on a system-wide basis.
    108     0    stevel  * When tuning these variables, avoid pathological values; while some attempt
    109     0    stevel  * is made to verify the integrity of these variables, they are not considered
    110     0    stevel  * part of the supported interface to DTrace, and they are therefore not
    111     0    stevel  * checked comprehensively.  Further, these variables should not be tuned
    112     0    stevel  * dynamically via "mdb -kw" or other means; they should only be tuned via
    113     0    stevel  * /etc/system.
    114     0    stevel  */
    115     0    stevel int		dtrace_destructive_disallow = 0;
    116     0    stevel dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
    117     0    stevel size_t		dtrace_difo_maxsize = (256 * 1024);
    118     0    stevel dtrace_optval_t	dtrace_dof_maxsize = (256 * 1024);
    119     0    stevel size_t		dtrace_global_maxsize = (16 * 1024);
    120     0    stevel size_t		dtrace_actions_max = (16 * 1024);
    121     0    stevel size_t		dtrace_retain_max = 1024;
    122     0    stevel dtrace_optval_t	dtrace_helper_actions_max = 32;
    123     0    stevel dtrace_optval_t	dtrace_helper_providers_max = 32;
    124     0    stevel dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
    125     0    stevel size_t		dtrace_strsize_default = 256;
    126     0    stevel dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
    127     0    stevel dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
    128     0    stevel dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
    129     0    stevel dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
    130     0    stevel dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
    131     0    stevel dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
    132     0    stevel dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
    133     0    stevel dtrace_optval_t	dtrace_nspec_default = 1;
    134     0    stevel dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
    135     0    stevel dtrace_optval_t dtrace_stackframes_default = 20;
    136     0    stevel dtrace_optval_t dtrace_ustackframes_default = 20;
    137     0    stevel dtrace_optval_t dtrace_jstackframes_default = 50;
    138     0    stevel dtrace_optval_t dtrace_jstackstrsize_default = 512;
    139     0    stevel int		dtrace_msgdsize_max = 128;
    140     0    stevel hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
    141     0    stevel hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
    142     0    stevel int		dtrace_devdepth_max = 32;
    143     0    stevel int		dtrace_err_verbose;
    144     0    stevel hrtime_t	dtrace_deadman_interval = NANOSEC;
    145     0    stevel hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
    146     0    stevel hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
    147     0    stevel 
    148     0    stevel /*
    149     0    stevel  * DTrace External Variables
    150     0    stevel  *
    151     0    stevel  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
    152     0    stevel  * available to DTrace consumers via the backtick (`) syntax.  One of these,
    153     0    stevel  * dtrace_zero, is made deliberately so:  it is provided as a source of
    154     0    stevel  * well-known, zero-filled memory.  While this variable is not documented,
    155     0    stevel  * it is used by some translators as an implementation detail.
    156     0    stevel  */
    157     0    stevel const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
    158     0    stevel 
    159     0    stevel /*
    160     0    stevel  * DTrace Internal Variables
    161     0    stevel  */
    162     0    stevel static dev_info_t	*dtrace_devi;		/* device info */
    163     0    stevel static vmem_t		*dtrace_arena;		/* probe ID arena */
    164     0    stevel static vmem_t		*dtrace_minor;		/* minor number arena */
    165     0    stevel static taskq_t		*dtrace_taskq;		/* task queue */
    166     0    stevel static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
    167     0    stevel static int		dtrace_nprobes;		/* number of probes */
    168     0    stevel static dtrace_provider_t *dtrace_provider;	/* provider list */
    169     0    stevel static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
    170     0    stevel static int		dtrace_opens;		/* number of opens */
    171   457       bmc static int		dtrace_helpers;		/* number of helpers */
    172     0    stevel static void		*dtrace_softstate;	/* softstate pointer */
    173     0    stevel static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
    174     0    stevel static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
    175     0    stevel static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
    176     0    stevel static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
    177     0    stevel static int		dtrace_toxranges;	/* number of toxic ranges */
    178     0    stevel static int		dtrace_toxranges_max;	/* size of toxic range array */
    179     0    stevel static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
    180     0    stevel static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
    181     0    stevel static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
    182     0    stevel static kthread_t	*dtrace_panicked;	/* panicking thread */
    183     0    stevel static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
    184     0    stevel static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
    185     0    stevel static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
    186     0    stevel static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
    187  7590  Jonathan static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
    188  1739       bmc static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
    189     0    stevel 
    190     0    stevel /*
    191     0    stevel  * DTrace Locking
    192     0    stevel  * DTrace is protected by three (relatively coarse-grained) locks:
    193     0    stevel  *
    194     0    stevel  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
    195     0    stevel  *     including enabling state, probes, ECBs, consumer state, helper state,
    196     0    stevel  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
    197     0    stevel  *     probe context is lock-free -- synchronization is handled via the
    198     0    stevel  *     dtrace_sync() cross call mechanism.
    199     0    stevel  *
    200     0    stevel  * (2) dtrace_provider_lock is required when manipulating provider state, or
    201     0    stevel  *     when provider state must be held constant.
    202     0    stevel  *
    203     0    stevel  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
    204     0    stevel  *     when meta provider state must be held constant.
    205     0    stevel  *
    206     0    stevel  * The lock ordering between these three locks is dtrace_meta_lock before
    207     0    stevel  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
    208     0    stevel  * several places where dtrace_provider_lock is held by the framework as it
    209     0    stevel  * calls into the providers -- which then call back into the framework,
    210     0    stevel  * grabbing dtrace_lock.)
    211     0    stevel  *
    212   457       bmc  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
    213   457       bmc  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
    214   457       bmc  * role as a coarse-grained lock; it is acquired before both of these locks.
    215   457       bmc  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
    216   457       bmc  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
    217   457       bmc  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
    218   457       bmc  * acquired _between_ dtrace_provider_lock and dtrace_lock.
    219     0    stevel  */
    220     0    stevel static kmutex_t		dtrace_lock;		/* probe state lock */
    221     0    stevel static kmutex_t		dtrace_provider_lock;	/* provider state lock */
    222     0    stevel static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
    223     0    stevel 
    224     0    stevel /*
    225     0    stevel  * DTrace Provider Variables
    226     0    stevel  *
    227     0    stevel  * These are the variables relating to DTrace as a provider (that is, the
    228     0    stevel  * provider of the BEGIN, END, and ERROR probes).
    229     0    stevel  */
    230     0    stevel static dtrace_pattr_t	dtrace_provider_attr = {
    231     0    stevel { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
    232     0    stevel { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
    233     0    stevel { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
    234     0    stevel { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
    235     0    stevel { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
    236     0    stevel };
    237     0    stevel 
    238     0    stevel static void
    239     0    stevel dtrace_nullop(void)
    240     0    stevel {}
    241     0    stevel 
    242  8803  Jonathan static int
    243  8803  Jonathan dtrace_enable_nullop(void)
    244  8803  Jonathan {
    245  8803  Jonathan 	return (0);
    246  8803  Jonathan }
    247  8803  Jonathan 
    248     0    stevel static dtrace_pops_t	dtrace_provider_ops = {
    249     0    stevel 	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
    250     0    stevel 	(void (*)(void *, struct modctl *))dtrace_nullop,
    251  8803  Jonathan 	(int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
    252     0    stevel 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    253     0    stevel 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    254     0    stevel 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
    255     0    stevel 	NULL,
    256     0    stevel 	NULL,
    257     0    stevel 	NULL,
    258     0    stevel 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
    259     0    stevel };
    260     0    stevel 
    261     0    stevel static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
    262     0    stevel static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
    263     0    stevel dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */
    264     0    stevel 
    265     0    stevel /*
    266     0    stevel  * DTrace Helper Tracing Variables
    267     0    stevel  */
    268     0    stevel uint32_t dtrace_helptrace_next = 0;
    269     0    stevel uint32_t dtrace_helptrace_nlocals;
    270     0    stevel char	*dtrace_helptrace_buffer;
    271     0    stevel int	dtrace_helptrace_bufsize = 512 * 1024;
    272     0    stevel 
    273     0    stevel #ifdef DEBUG
    274     0    stevel int	dtrace_helptrace_enabled = 1;
    275     0    stevel #else
    276     0    stevel int	dtrace_helptrace_enabled = 0;
    277     0    stevel #endif
    278     0    stevel 
    279     0    stevel /*
    280     0    stevel  * DTrace Error Hashing
    281     0    stevel  *
    282     0    stevel  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
    283     0    stevel  * table.  This is very useful for checking coverage of tests that are
    284     0    stevel  * expected to induce DIF or DOF processing errors, and may be useful for
    285     0    stevel  * debugging problems in the DIF code generator or in DOF generation.  The
    286     0    stevel  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
    287     0    stevel  */
    288     0    stevel #ifdef DEBUG
    289     0    stevel static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
    290     0    stevel static const char *dtrace_errlast;
    291     0    stevel static kthread_t *dtrace_errthread;
    292     0    stevel static kmutex_t dtrace_errlock;
    293     0    stevel #endif
    294     0    stevel 
    295     0    stevel /*
    296     0    stevel  * DTrace Macros and Constants
    297     0    stevel  *
    298     0    stevel  * These are various macros that are useful in various spots in the
    299     0    stevel  * implementation, along with a few random constants that have no meaning
    300     0    stevel  * outside of the implementation.  There is no real structure to this cpp
    301     0    stevel  * mishmash -- but is there ever?
    302     0    stevel  */
    303     0    stevel #define	DTRACE_HASHSTR(hash, probe)	\
    304     0    stevel 	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
    305     0    stevel 
    306     0    stevel #define	DTRACE_HASHNEXT(hash, probe)	\
    307     0    stevel 	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
    308     0    stevel 
    309     0    stevel #define	DTRACE_HASHPREV(hash, probe)	\
    310     0    stevel 	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
    311     0    stevel 
    312     0    stevel #define	DTRACE_HASHEQ(hash, lhs, rhs)	\
    313     0    stevel 	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
    314     0    stevel 	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
    315     0    stevel 
    316     0    stevel #define	DTRACE_AGGHASHSIZE_SLEW		17
    317  4291   brendan 
    318  4291   brendan #define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
    319     0    stevel 
    320     0    stevel /*
    321     0    stevel  * The key for a thread-local variable consists of the lower 61 bits of the
    322     0    stevel  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
    323     0    stevel  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
    324     0    stevel  * equal to a variable identifier.  This is necessary (but not sufficient) to
    325     0    stevel  * assure that global associative arrays never collide with thread-local
    326     0    stevel  * variables.  To guarantee that they cannot collide, we must also define the
    327     0    stevel  * order for keying dynamic variables.  That order is:
    328     0    stevel  *
    329     0    stevel  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
    330     0    stevel  *
    331     0    stevel  * Because the variable-key and the tls-key are in orthogonal spaces, there is
    332     0    stevel  * no way for a global variable key signature to match a thread-local key
    333     0    stevel  * signature.
    334     0    stevel  */
    335     0    stevel #define	DTRACE_TLS_THRKEY(where) { \
    336     0    stevel 	uint_t intr = 0; \
    337     0    stevel 	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
    338     0    stevel 	for (; actv; actv >>= 1) \
    339     0    stevel 		intr++; \
    340     0    stevel 	ASSERT(intr < (1 << 3)); \
    341     0    stevel 	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
    342     0    stevel 	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
    343     0    stevel }
    344  2769       ahl 
    345  2769       ahl #define	DT_BSWAP_8(x)	((x) & 0xff)
    346  2769       ahl #define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
    347  2769       ahl #define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
    348  2769       ahl #define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
    349  5984   jhaslam 
    350  5984   jhaslam #define	DT_MASK_LO 0x00000000FFFFFFFFULL
    351     0    stevel 
    352     0    stevel #define	DTRACE_STORE(type, tomax, offset, what) \
    353     0    stevel 	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
    354     0    stevel 
    355     0    stevel #ifndef __i386
    356     0    stevel #define	DTRACE_ALIGNCHECK(addr, size, flags)				\
    357     0    stevel 	if (addr & (size - 1)) {					\
    358     0    stevel 		*flags |= CPU_DTRACE_BADALIGN;				\
    359     0    stevel 		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
    360     0    stevel 		return (0);						\
    361     0    stevel 	}
    362     0    stevel #else
    363     0    stevel #define	DTRACE_ALIGNCHECK(addr, size, flags)
    364     0    stevel #endif
    365  2870        dp 
    366  2870        dp /*
    367  2870        dp  * Test whether a range of memory starting at testaddr of size testsz falls
    368  2922        dp  * within the range of memory described by baseaddr, basesz.  We take care to
    369  2922        dp  * avoid problems with overflow and underflow of the unsigned quantities, and
    370  2922        dp  * disallow all negative sizes.  Ranges of size 0 are allowed.
    371  2870        dp  */
    372  2870        dp #define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
    373  2870        dp 	((testaddr) - (baseaddr) < (basesz) && \
    374  2922        dp 	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
    375  2922        dp 	(testaddr) + (testsz) >= (testaddr))
    376  2922        dp 
    377  2922        dp /*
    378  2922        dp  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
    379  2922        dp  * alloc_sz on the righthand side of the comparison in order to avoid overflow
    380  2922        dp  * or underflow in the comparison with it.  This is simpler than the INRANGE
    381  2922        dp  * check above, because we know that the dtms_scratch_ptr is valid in the
    382  2922        dp  * range.  Allocations of size zero are allowed.
    383  2922        dp  */
    384  2922        dp #define	DTRACE_INSCRATCH(mstate, alloc_sz) \
    385  2922        dp 	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
    386  2922        dp 	(mstate)->dtms_scratch_ptr >= (alloc_sz))
    387     0    stevel 
    388     0    stevel #define	DTRACE_LOADFUNC(bits)						\
    389     0    stevel /*CSTYLED*/								\
    390     0    stevel uint##bits##_t								\
    391     0    stevel dtrace_load##bits(uintptr_t addr)					\
    392     0    stevel {									\
    393     0    stevel 	size_t size = bits / NBBY;					\
    394     0    stevel 	/*CSTYLED*/							\
    395     0    stevel 	uint##bits##_t rval;						\
    396     0    stevel 	int i;								\
    397     0    stevel 	volatile uint16_t *flags = (volatile uint16_t *)		\
    398     0    stevel 	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
    399     0    stevel 									\
    400     0    stevel 	DTRACE_ALIGNCHECK(addr, size, flags);				\
    401     0    stevel 									\
    402     0    stevel 	for (i = 0; i < dtrace_toxranges; i++) {			\
    403     0    stevel 		if (addr >= dtrace_toxrange[i].dtt_limit)		\
    404     0    stevel 			continue;					\
    405     0    stevel 									\
    406     0    stevel 		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
    407     0    stevel 			continue;					\
    408     0    stevel 									\
    409     0    stevel 		/*							\
    410     0    stevel 		 * This address falls within a toxic region; return 0.	\
    411     0    stevel 		 */							\
    412     0    stevel 		*flags |= CPU_DTRACE_BADADDR;				\
    413     0    stevel 		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
    414     0    stevel 		return (0);						\
    415     0    stevel 	}								\
    416     0    stevel 									\
    417     0    stevel 	*flags |= CPU_DTRACE_NOFAULT;					\
    418     0    stevel 	/*CSTYLED*/							\
    419     0    stevel 	rval = *((volatile uint##bits##_t *)addr);			\
    420     0    stevel 	*flags &= ~CPU_DTRACE_NOFAULT;					\
    421     0    stevel 									\
    422  3043       bmc 	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
    423     0    stevel }
    424     0    stevel 
    425     0    stevel #ifdef _LP64
    426     0    stevel #define	dtrace_loadptr	dtrace_load64
    427     0    stevel #else
    428     0    stevel #define	dtrace_loadptr	dtrace_load32
    429     0    stevel #endif
    430     0    stevel 
    431  1739       bmc #define	DTRACE_DYNHASH_FREE	0
    432  1739       bmc #define	DTRACE_DYNHASH_SINK	1
    433  1739       bmc #define	DTRACE_DYNHASH_VALID	2
    434  1739       bmc 
    435  8803  Jonathan #define	DTRACE_MATCH_FAIL	-1
    436     0    stevel #define	DTRACE_MATCH_NEXT	0
    437     0    stevel #define	DTRACE_MATCH_DONE	1
    438     0    stevel #define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
    439     0    stevel #define	DTRACE_STATE_ALIGN	64
    440   491       bmc 
    441   491       bmc #define	DTRACE_FLAGS2FLT(flags)						\
    442   491       bmc 	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
    443   491       bmc 	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
    444   491       bmc 	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
    445   491       bmc 	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
    446   491       bmc 	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
    447   491       bmc 	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
    448   491       bmc 	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
    449   491       bmc 	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
    450  3682   jhaslam 	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
    451   491       bmc 	DTRACEFLT_UNKNOWN)
    452     0    stevel 
    453  1017       bmc #define	DTRACEACT_ISSTRING(act)						\
    454  1017       bmc 	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
    455  1017       bmc 	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
    456  1017       bmc 
    457  2870        dp static size_t dtrace_strlen(const char *, size_t);
    458     0    stevel static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
    459     0    stevel static void dtrace_enabling_provide(dtrace_provider_t *);
    460     0    stevel static int dtrace_enabling_match(dtrace_enabling_t *, int *);
    461     0    stevel static void dtrace_enabling_matchall(void);
    462     0    stevel static dtrace_state_t *dtrace_anon_grab(void);
    463     0    stevel static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    464     0    stevel     dtrace_state_t *, uint64_t, uint64_t);
    465     0    stevel static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
    466     0    stevel static void dtrace_buffer_drop(dtrace_buffer_t *);
    467     0    stevel static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    468     0    stevel     dtrace_state_t *, dtrace_mstate_t *);
    469     0    stevel static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    470     0    stevel     dtrace_optval_t);
    471     0    stevel static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
    472  2179       ahl static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
    473     0    stevel 
    474     0    stevel /*
    475     0    stevel  * DTrace Probe Context Functions
    476     0    stevel  *
    477     0    stevel  * These functions are called from probe context.  Because probe context is
    478     0    stevel  * any context in which C may be called, arbitrary locks may be held,
    479     0    stevel  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
    480     0    stevel  * As a result, functions called from probe context may only call other DTrace
    481     0    stevel  * support functions -- they may not interact at all with the system at large.
    482     0    stevel  * (Note that the ASSERT macro is made probe-context safe by redefining it in
    483     0    stevel  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
    484     0    stevel  * loads are to be performed from probe context, they _must_ be in terms of
    485     0    stevel  * the safe dtrace_load*() variants.
    486     0    stevel  *
    487     0    stevel  * Some functions in this block are not actually called from probe context;
    488     0    stevel  * for these functions, there will be a comment above the function reading
    489     0    stevel  * "Note:  not called from probe context."
    490     0    stevel  */
    491     0    stevel void
    492     0    stevel dtrace_panic(const char *format, ...)
    493     0    stevel {
    494     0    stevel 	va_list alist;
    495     0    stevel 
    496     0    stevel 	va_start(alist, format);
    497     0    stevel 	dtrace_vpanic(format, alist);
    498     0    stevel 	va_end(alist);
    499     0    stevel }
    500     0    stevel 
    501     0    stevel int
    502     0    stevel dtrace_assfail(const char *a, const char *f, int l)
    503     0    stevel {
    504     0    stevel 	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
    505     0    stevel 
    506     0    stevel 	/*
    507     0    stevel 	 * We just need something here that even the most clever compiler
    508     0    stevel 	 * cannot optimize away.
    509     0    stevel 	 */
    510     0    stevel 	return (a[(uintptr_t)f]);
    511     0    stevel }
    512     0    stevel 
    513     0    stevel /*
    514   457       bmc  * Atomically increment a specified error counter from probe context.
    515   457       bmc  */
    516   457       bmc static void
    517   457       bmc dtrace_error(uint32_t *counter)
    518   457       bmc {
    519   457       bmc 	/*
    520   457       bmc 	 * Most counters stored to in probe context are per-CPU counters.
    521   457       bmc 	 * However, there are some error conditions that are sufficiently
    522   457       bmc 	 * arcane that they don't merit per-CPU storage.  If these counters
    523   457       bmc 	 * are incremented concurrently on different CPUs, scalability will be
    524   457       bmc 	 * adversely affected -- but we don't expect them to be white-hot in a
    525   457       bmc 	 * correctly constructed enabling...
    526   457       bmc 	 */
    527   457       bmc 	uint32_t oval, nval;
    528   457       bmc 
    529   457       bmc 	do {
    530   457       bmc 		oval = *counter;
    531   457       bmc 
    532   457       bmc 		if ((nval = oval + 1) == 0) {
    533   457       bmc 			/*
    534   457       bmc 			 * If the counter would wrap, set it to 1 -- assuring
    535   457       bmc 			 * that the counter is never zero when we have seen
    536   457       bmc 			 * errors.  (The counter must be 32-bits because we
    537   457       bmc 			 * aren't guaranteed a 64-bit compare&swap operation.)
    538   457       bmc 			 * To save this code both the infamy of being fingered
    539   457       bmc 			 * by a priggish news story and the indignity of being
    540   457       bmc 			 * the target of a neo-puritan witch trial, we're
    541   457       bmc 			 * carefully avoiding any colorful description of the
    542   457       bmc 			 * likelihood of this condition -- but suffice it to
    543   457       bmc 			 * say that it is only slightly more likely than the
    544   457       bmc 			 * overflow of predicate cache IDs, as discussed in
    545   457       bmc 			 * dtrace_predicate_create().
    546   457       bmc 			 */
    547   457       bmc 			nval = 1;
    548   457       bmc 		}
    549   457       bmc 	} while (dtrace_cas32(counter, oval, nval) != oval);
    550   457       bmc }
    551   457       bmc 
/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.  The macro argument is the
 * width of the load in bits.  (These are presumably the dtrace_load8()
 * through dtrace_load64() routines used by the rest of this block -- confirm
 * against the DTRACE_LOADFUNC definition.)
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
    560     0    stevel 
    561     0    stevel static int
    562     0    stevel dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
    563     0    stevel {
    564     0    stevel 	if (dest < mstate->dtms_scratch_base)
    565     0    stevel 		return (0);
    566     0    stevel 
    567     0    stevel 	if (dest + size < dest)
    568     0    stevel 		return (0);
    569     0    stevel 
    570     0    stevel 	if (dest + size > mstate->dtms_scratch_ptr)
    571     0    stevel 		return (0);
    572     0    stevel 
    573     0    stevel 	return (1);
    574     0    stevel }
    575     0    stevel 
    576     0    stevel static int
    577     0    stevel dtrace_canstore_statvar(uint64_t addr, size_t sz,
    578     0    stevel     dtrace_statvar_t **svars, int nsvars)
    579     0    stevel {
    580     0    stevel 	int i;
    581     0    stevel 
    582     0    stevel 	for (i = 0; i < nsvars; i++) {
    583     0    stevel 		dtrace_statvar_t *svar = svars[i];
    584     0    stevel 
    585     0    stevel 		if (svar == NULL || svar->dtsv_size == 0)
    586     0    stevel 			continue;
    587     0    stevel 
    588  2870        dp 		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
    589     0    stevel 			return (1);
    590     0    stevel 	}
    591     0    stevel 
    592     0    stevel 	return (0);
    593     0    stevel }
    594     0    stevel 
    595     0    stevel /*
    596     0    stevel  * Check to see if the address is within a memory region to which a store may
    597     0    stevel  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
    598     0    stevel  * region.  The caller of dtrace_canstore() is responsible for performing any
    599     0    stevel  * alignment checks that are needed before stores are actually executed.
    600     0    stevel  */
    601     0    stevel static int
    602     0    stevel dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    603     0    stevel     dtrace_vstate_t *vstate)
    604     0    stevel {
    605     0    stevel 	/*
    606     0    stevel 	 * First, check to see if the address is in scratch space...
    607     0    stevel 	 */
    608  2870        dp 	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
    609  2870        dp 	    mstate->dtms_scratch_size))
    610     0    stevel 		return (1);
    611     0    stevel 
    612     0    stevel 	/*
    613     0    stevel 	 * Now check to see if it's a dynamic variable.  This check will pick
    614     0    stevel 	 * up both thread-local variables and any global dynamically-allocated
    615     0    stevel 	 * variables.
    616     0    stevel 	 */
    617  2870        dp 	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
    618  4682   jhaslam 	    vstate->dtvs_dynvars.dtds_size)) {
    619  4682   jhaslam 		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
    620  4682   jhaslam 		uintptr_t base = (uintptr_t)dstate->dtds_base +
    621  4682   jhaslam 		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
    622  4682   jhaslam 		uintptr_t chunkoffs;
    623  4682   jhaslam 
    624  4682   jhaslam 		/*
    625  4682   jhaslam 		 * Before we assume that we can store here, we need to make
    626  4682   jhaslam 		 * sure that it isn't in our metadata -- storing to our
    627  4682   jhaslam 		 * dynamic variable metadata would corrupt our state.  For
    628  4682   jhaslam 		 * the range to not include any dynamic variable metadata,
    629  4682   jhaslam 		 * it must:
    630  4682   jhaslam 		 *
    631  4682   jhaslam 		 *	(1) Start above the hash table that is at the base of
    632  4682   jhaslam 		 *	the dynamic variable space
    633  4682   jhaslam 		 *
    634  4682   jhaslam 		 *	(2) Have a starting chunk offset that is beyond the
    635  4682   jhaslam 		 *	dtrace_dynvar_t that is at the base of every chunk
    636  4682   jhaslam 		 *
    637  4682   jhaslam 		 *	(3) Not span a chunk boundary
    638  4682   jhaslam 		 *
    639  4682   jhaslam 		 */
    640  4682   jhaslam 		if (addr < base)
    641  4682   jhaslam 			return (0);
    642  4682   jhaslam 
    643  4682   jhaslam 		chunkoffs = (addr - base) % dstate->dtds_chunksize;
    644  4682   jhaslam 
    645  4682   jhaslam 		if (chunkoffs < sizeof (dtrace_dynvar_t))
    646  4682   jhaslam 			return (0);
    647  4682   jhaslam 
    648  4682   jhaslam 		if (chunkoffs + sz > dstate->dtds_chunksize)
    649  4682   jhaslam 			return (0);
    650  4682   jhaslam 
    651  4682   jhaslam 		return (1);
    652  4682   jhaslam 	}
    653     0    stevel 
    654     0    stevel 	/*
    655     0    stevel 	 * Finally, check the static local and global variables.  These checks
    656     0    stevel 	 * take the longest, so we perform them last.
    657     0    stevel 	 */
    658     0    stevel 	if (dtrace_canstore_statvar(addr, sz,
    659     0    stevel 	    vstate->dtvs_locals, vstate->dtvs_nlocals))
    660     0    stevel 		return (1);
    661     0    stevel 
    662     0    stevel 	if (dtrace_canstore_statvar(addr, sz,
    663     0    stevel 	    vstate->dtvs_globals, vstate->dtvs_nglobals))
    664     0    stevel 		return (1);
    665     0    stevel 
    666     0    stevel 	return (0);
    667  2870        dp }
    668  2870        dp 
    669  2870        dp 
    670  2870        dp /*
    671  2870        dp  * Convenience routine to check to see if the address is within a memory
    672  2870        dp  * region in which a load may be issued given the user's privilege level;
    673  2870        dp  * if not, it sets the appropriate error flags and loads 'addr' into the
    674  2870        dp  * illegal value slot.
    675  2870        dp  *
    676  2870        dp  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
    677  2870        dp  * appropriate memory access protection.
    678  2870        dp  */
    679  2870        dp static int
    680  2870        dp dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    681  2870        dp     dtrace_vstate_t *vstate)
    682  2870        dp {
    683  2870        dp 	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
    684  2870        dp 
    685  2870        dp 	/*
    686  2870        dp 	 * If we hold the privilege to read from kernel memory, then
    687  2870        dp 	 * everything is readable.
    688  2870        dp 	 */
    689  2870        dp 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
    690  2870        dp 		return (1);
    691  2870        dp 
    692  2870        dp 	/*
    693  2870        dp 	 * You can obviously read that which you can store.
    694  2870        dp 	 */
    695  2870        dp 	if (dtrace_canstore(addr, sz, mstate, vstate))
    696  2870        dp 		return (1);
    697  2870        dp 
    698  2870        dp 	/*
    699  2870        dp 	 * We're allowed to read from our own string table.
    700  2870        dp 	 */
    701  2870        dp 	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
    702  2870        dp 	    mstate->dtms_difo->dtdo_strlen))
    703  2870        dp 		return (1);
    704  2870        dp 
    705  2870        dp 	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
    706  2870        dp 	*illval = addr;
    707  2870        dp 	return (0);
    708  2870        dp }
    709  2870        dp 
    710  2870        dp /*
    711  2870        dp  * Convenience routine to check to see if a given string is within a memory
    712  2870        dp  * region in which a load may be issued given the user's privilege level;
    713  2870        dp  * this exists so that we don't need to issue unnecessary dtrace_strlen()
    714  2870        dp  * calls in the event that the user has all privileges.
    715  2870        dp  */
    716  2870        dp static int
    717  2870        dp dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    718  2870        dp     dtrace_vstate_t *vstate)
    719  2870        dp {
    720  2870        dp 	size_t strsz;
    721  2870        dp 
    722  2870        dp 	/*
    723  2870        dp 	 * If we hold the privilege to read from kernel memory, then
    724  2870        dp 	 * everything is readable.
    725  2870        dp 	 */
    726  2870        dp 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
    727  2870        dp 		return (1);
    728  2870        dp 
    729  2870        dp 	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
    730  2870        dp 	if (dtrace_canload(addr, strsz, mstate, vstate))
    731  2870        dp 		return (1);
    732  2870        dp 
    733  2870        dp 	return (0);
    734  2870        dp }
    735  2870        dp 
    736  2870        dp /*
    737  2870        dp  * Convenience routine to check to see if a given variable is within a memory
    738  2870        dp  * region in which a load may be issued given the user's privilege level.
    739  2870        dp  */
    740  2870        dp static int
    741  2870        dp dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    742  2870        dp     dtrace_vstate_t *vstate)
    743  2870        dp {
    744  2870        dp 	size_t sz;
    745  2870        dp 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
    746  2870        dp 
    747  2870        dp 	/*
    748  2870        dp 	 * If we hold the privilege to read from kernel memory, then
    749  2870        dp 	 * everything is readable.
    750  2870        dp 	 */
    751  2870        dp 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
    752  2870        dp 		return (1);
    753  2870        dp 
    754  2870        dp 	if (type->dtdt_kind == DIF_TYPE_STRING)
    755  2870        dp 		sz = dtrace_strlen(src,
    756  2870        dp 		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
    757  2870        dp 	else
    758  2870        dp 		sz = type->dtdt_size;
    759  2870        dp 
    760  2870        dp 	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
    761     0    stevel }
    762     0    stevel 
    763     0    stevel /*
    764     0    stevel  * Compare two strings using safe loads.
    765     0    stevel  */
    766     0    stevel static int
    767     0    stevel dtrace_strncmp(char *s1, char *s2, size_t limit)
    768     0    stevel {
    769     0    stevel 	uint8_t c1, c2;
    770     0    stevel 	volatile uint16_t *flags;
    771     0    stevel 
    772     0    stevel 	if (s1 == s2 || limit == 0)
    773     0    stevel 		return (0);
    774     0    stevel 
    775     0    stevel 	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
    776     0    stevel 
    777     0    stevel 	do {
    778     0    stevel 		if (s1 == NULL) {
    779     0    stevel 			c1 = '\0';
    780     0    stevel 		} else {
    781     0    stevel 			c1 = dtrace_load8((uintptr_t)s1++);
    782     0    stevel 		}
    783     0    stevel 
    784     0    stevel 		if (s2 == NULL) {
    785     0    stevel 			c2 = '\0';
    786     0    stevel 		} else {
    787     0    stevel 			c2 = dtrace_load8((uintptr_t)s2++);
    788     0    stevel 		}
    789     0    stevel 
    790     0    stevel 		if (c1 != c2)
    791     0    stevel 			return (c1 - c2);
    792     0    stevel 	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
    793     0    stevel 
    794     0    stevel 	return (0);
    795     0    stevel }
    796     0    stevel 
    797     0    stevel /*
    798     0    stevel  * Compute strlen(s) for a string using safe memory accesses.  The additional
    799     0    stevel  * len parameter is used to specify a maximum length to ensure completion.
    800     0    stevel  */
    801     0    stevel static size_t
    802     0    stevel dtrace_strlen(const char *s, size_t lim)
    803     0    stevel {
    804     0    stevel 	uint_t len;
    805     0    stevel 
    806     0    stevel 	for (len = 0; len != lim; len++) {
    807     0    stevel 		if (dtrace_load8((uintptr_t)s++) == '\0')
    808     0    stevel 			break;
    809     0    stevel 	}
    810     0    stevel 
    811     0    stevel 	return (len);
    812     0    stevel }
    813     0    stevel 
    814     0    stevel /*
    815     0    stevel  * Check if an address falls within a toxic region.
    816     0    stevel  */
    817     0    stevel static int
    818     0    stevel dtrace_istoxic(uintptr_t kaddr, size_t size)
    819     0    stevel {
    820     0    stevel 	uintptr_t taddr, tsize;
    821     0    stevel 	int i;
    822     0    stevel 
    823     0    stevel 	for (i = 0; i < dtrace_toxranges; i++) {
    824     0    stevel 		taddr = dtrace_toxrange[i].dtt_base;
    825     0    stevel 		tsize = dtrace_toxrange[i].dtt_limit - taddr;
    826     0    stevel 
    827     0    stevel 		if (kaddr - taddr < tsize) {
    828     0    stevel 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
    829     0    stevel 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
    830     0    stevel 			return (1);
    831     0    stevel 		}
    832     0    stevel 
    833     0    stevel 		if (taddr - kaddr < size) {
    834     0    stevel 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
    835     0    stevel 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
    836     0    stevel 			return (1);
    837     0    stevel 		}
    838     0    stevel 	}
    839     0    stevel 
    840     0    stevel 	return (0);
    841     0    stevel }
    842     0    stevel 
    843     0    stevel /*
    844     0    stevel  * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
    845     0    stevel  * memory specified by the DIF program.  The dst is assumed to be safe memory
    846     0    stevel  * that we can store to directly because it is managed by DTrace.  As with
    847     0    stevel  * standard bcopy, overlapping copies are handled properly.
    848     0    stevel  */
    849     0    stevel static void
    850     0    stevel dtrace_bcopy(const void *src, void *dst, size_t len)
    851     0    stevel {
    852     0    stevel 	if (len != 0) {
    853     0    stevel 		uint8_t *s1 = dst;
    854     0    stevel 		const uint8_t *s2 = src;
    855     0    stevel 
    856     0    stevel 		if (s1 <= s2) {
    857     0    stevel 			do {
    858     0    stevel 				*s1++ = dtrace_load8((uintptr_t)s2++);
    859     0    stevel 			} while (--len != 0);
    860     0    stevel 		} else {
    861     0    stevel 			s2 += len;
    862     0    stevel 			s1 += len;
    863     0    stevel 
    864     0    stevel 			do {
    865     0    stevel 				*--s1 = dtrace_load8((uintptr_t)--s2);
    866     0    stevel 			} while (--len != 0);
    867     0    stevel 		}
    868     0    stevel 	}
    869     0    stevel }
    870     0    stevel 
    871     0    stevel /*
    872     0    stevel  * Copy src to dst using safe memory accesses, up to either the specified
    873     0    stevel  * length, or the point that a nul byte is encountered.  The src is assumed to
    874     0    stevel  * be unsafe memory specified by the DIF program.  The dst is assumed to be
    875     0    stevel  * safe memory that we can store to directly because it is managed by DTrace.
    876     0    stevel  * Unlike dtrace_bcopy(), overlapping regions are not handled.
    877     0    stevel  */
    878     0    stevel static void
    879     0    stevel dtrace_strcpy(const void *src, void *dst, size_t len)
    880     0    stevel {
    881     0    stevel 	if (len != 0) {
    882     0    stevel 		uint8_t *s1 = dst, c;
    883     0    stevel 		const uint8_t *s2 = src;
    884     0    stevel 
    885     0    stevel 		do {
    886     0    stevel 			*s1++ = c = dtrace_load8((uintptr_t)s2++);
    887     0    stevel 		} while (--len != 0 && c != '\0');
    888     0    stevel 	}
    889     0    stevel }
    890     0    stevel 
    891     0    stevel /*
    892     0    stevel  * Copy src to dst, deriving the size and type from the specified (BYREF)
    893     0    stevel  * variable type.  The src is assumed to be unsafe memory specified by the DIF
    894     0    stevel  * program.  The dst is assumed to be DTrace variable memory that is of the
    895     0    stevel  * specified type; we assume that we can store to directly.
    896     0    stevel  */
    897     0    stevel static void
    898     0    stevel dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
    899     0    stevel {
    900     0    stevel 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
    901     0    stevel 
    902     0    stevel 	if (type->dtdt_kind == DIF_TYPE_STRING) {
    903     0    stevel 		dtrace_strcpy(src, dst, type->dtdt_size);
    904     0    stevel 	} else {
    905     0    stevel 		dtrace_bcopy(src, dst, type->dtdt_size);
    906     0    stevel 	}
    907     0    stevel }
    908     0    stevel 
    909     0    stevel /*
    910     0    stevel  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
    911     0    stevel  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
    912     0    stevel  * safe memory that we can access directly because it is managed by DTrace.
    913     0    stevel  */
    914     0    stevel static int
    915     0    stevel dtrace_bcmp(const void *s1, const void *s2, size_t len)
    916     0    stevel {
    917     0    stevel 	volatile uint16_t *flags;
    918     0    stevel 
    919     0    stevel 	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
    920     0    stevel 
    921     0    stevel 	if (s1 == s2)
    922     0    stevel 		return (0);
    923     0    stevel 
    924     0    stevel 	if (s1 == NULL || s2 == NULL)
    925     0    stevel 		return (1);
    926     0    stevel 
    927     0    stevel 	if (s1 != s2 && len != 0) {
    928     0    stevel 		const uint8_t *ps1 = s1;
    929     0    stevel 		const uint8_t *ps2 = s2;
    930     0    stevel 
    931     0    stevel 		do {
    932     0    stevel 			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
    933     0    stevel 				return (1);
    934     0    stevel 		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
    935     0    stevel 	}
    936     0    stevel 	return (0);
    937     0    stevel }
    938     0    stevel 
    939     0    stevel /*
    940     0    stevel  * Zero the specified region using a simple byte-by-byte loop.  Note that this
    941     0    stevel  * is for safe DTrace-managed memory only.
    942     0    stevel  */
    943     0    stevel static void
    944     0    stevel dtrace_bzero(void *dst, size_t len)
    945     0    stevel {
    946     0    stevel 	uchar_t *cp;
    947     0    stevel 
    948     0    stevel 	for (cp = dst; len != 0; len--)
    949     0    stevel 		*cp++ = 0;
    950     0    stevel }
    951     0    stevel 
    952  5984   jhaslam static void
    953  5984   jhaslam dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
    954  5984   jhaslam {
    955  5984   jhaslam 	uint64_t result[2];
    956  5984   jhaslam 
    957  5984   jhaslam 	result[0] = addend1[0] + addend2[0];
    958  5984   jhaslam 	result[1] = addend1[1] + addend2[1] +
    959  5984   jhaslam 	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
    960  5984   jhaslam 
    961  5984   jhaslam 	sum[0] = result[0];
    962  5984   jhaslam 	sum[1] = result[1];
    963  5984   jhaslam }
    964  5984   jhaslam 
    965  5984   jhaslam /*
    966  5984   jhaslam  * Shift the 128-bit value in a by b. If b is positive, shift left.
    967  5984   jhaslam  * If b is negative, shift right.
    968  5984   jhaslam  */
    969  5984   jhaslam static void
    970  5984   jhaslam dtrace_shift_128(uint64_t *a, int b)
    971  5984   jhaslam {
    972  5984   jhaslam 	uint64_t mask;
    973  5984   jhaslam 
    974  5984   jhaslam 	if (b == 0)
    975  5984   jhaslam 		return;
    976  5984   jhaslam 
    977  5984   jhaslam 	if (b < 0) {
    978  5984   jhaslam 		b = -b;
    979  5984   jhaslam 		if (b >= 64) {
    980  5984   jhaslam 			a[0] = a[1] >> (b - 64);
    981  5984   jhaslam 			a[1] = 0;
    982  5984   jhaslam 		} else {
    983  5984   jhaslam 			a[0] >>= b;
    984  5984   jhaslam 			mask = 1LL << (64 - b);
    985  5984   jhaslam 			mask -= 1;
    986  5984   jhaslam 			a[0] |= ((a[1] & mask) << (64 - b));
    987  5984   jhaslam 			a[1] >>= b;
    988  5984   jhaslam 		}
    989  5984   jhaslam 	} else {
    990  5984   jhaslam 		if (b >= 64) {
    991  5984   jhaslam 			a[1] = a[0] << (b - 64);
    992  5984   jhaslam 			a[0] = 0;
    993  5984   jhaslam 		} else {
    994  5984   jhaslam 			a[1] <<= b;
    995  5984   jhaslam 			mask = a[0] >> (64 - b);
    996  5984   jhaslam 			a[1] |= mask;
    997  5984   jhaslam 			a[0] <<= b;
    998  5984   jhaslam 		}
    999  5984   jhaslam 	}
   1000  5984   jhaslam }
   1001  5984   jhaslam 
   1002  5984   jhaslam /*
   1003  5984   jhaslam  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
   1004  5984   jhaslam  * use native multiplication on those, and then re-combine into the
   1005  5984   jhaslam  * resulting 128-bit value.
   1006  5984   jhaslam  *
   1007  5984   jhaslam  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
   1008  5984   jhaslam  *     hi1 * hi2 << 64 +
   1009  5984   jhaslam  *     hi1 * lo2 << 32 +
   1010  5984   jhaslam  *     hi2 * lo1 << 32 +
   1011  5984   jhaslam  *     lo1 * lo2
   1012  5984   jhaslam  */
   1013  5984   jhaslam static void
   1014  5984   jhaslam dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
   1015  5984   jhaslam {
   1016  5984   jhaslam 	uint64_t hi1, hi2, lo1, lo2;
   1017  5984   jhaslam 	uint64_t tmp[2];
   1018  5984   jhaslam 
   1019  5984   jhaslam 	hi1 = factor1 >> 32;
   1020  5984   jhaslam 	hi2 = factor2 >> 32;
   1021  5984   jhaslam 
   1022  5984   jhaslam 	lo1 = factor1 & DT_MASK_LO;
   1023  5984   jhaslam 	lo2 = factor2 & DT_MASK_LO;
   1024  5984   jhaslam 
   1025  5984   jhaslam 	product[0] = lo1 * lo2;
   1026  5984   jhaslam 	product[1] = hi1 * hi2;
   1027  5984   jhaslam 
   1028  5984   jhaslam 	tmp[0] = hi1 * lo2;
   1029  5984   jhaslam 	tmp[1] = 0;
   1030  5984   jhaslam 	dtrace_shift_128(tmp, 32);
   1031  5984   jhaslam 	dtrace_add_128(product, tmp, product);
   1032  5984   jhaslam 
   1033  5984   jhaslam 	tmp[0] = hi2 * lo1;
   1034  5984   jhaslam 	tmp[1] = 0;
   1035  5984   jhaslam 	dtrace_shift_128(tmp, 32);
   1036  5984   jhaslam 	dtrace_add_128(product, tmp, product);
   1037  5984   jhaslam }
   1038  5984   jhaslam 
   1039     0    stevel /*
   1040  1677        dp  * This privilege check should be used by actions and subroutines to
   1041  1677        dp  * verify that the user credentials of the process that enabled the
   1042  1677        dp  * invoking ECB match the target credentials
   1043  1677        dp  */
   1044  1677        dp static int
   1045  1677        dp dtrace_priv_proc_common_user(dtrace_state_t *state)
   1046  1677        dp {
   1047  1677        dp 	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
   1048  1677        dp 
   1049  1677        dp 	/*
   1050  1677        dp 	 * We should always have a non-NULL state cred here, since if cred
   1051  1677        dp 	 * is null (anonymous tracing), we fast-path bypass this routine.
   1052  1677        dp 	 */
   1053  1677        dp 	ASSERT(s_cr != NULL);
   1054  1677        dp 
   1055  1677        dp 	if ((cr = CRED()) != NULL &&
   1056  1677        dp 	    s_cr->cr_uid == cr->cr_uid &&
   1057  1677        dp 	    s_cr->cr_uid == cr->cr_ruid &&
   1058  1677        dp 	    s_cr->cr_uid == cr->cr_suid &&
   1059  1677        dp 	    s_cr->cr_gid == cr->cr_gid &&
   1060  1677        dp 	    s_cr->cr_gid == cr->cr_rgid &&
   1061  1677        dp 	    s_cr->cr_gid == cr->cr_sgid)
   1062  1677        dp 		return (1);
   1063  1677        dp 
   1064  1677        dp 	return (0);
   1065  1677        dp }
   1066  1677        dp 
   1067  1677        dp /*
   1068  1677        dp  * This privilege check should be used by actions and subroutines to
   1069  1677        dp  * verify that the zone of the process that enabled the invoking ECB
   1070  1677        dp  * matches the target credentials
   1071  1677        dp  */
   1072  1677        dp static int
   1073  1677        dp dtrace_priv_proc_common_zone(dtrace_state_t *state)
   1074  1677        dp {
   1075  1677        dp 	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
   1076  1677        dp 
   1077  1677        dp 	/*
   1078  1677        dp 	 * We should always have a non-NULL state cred here, since if cred
   1079  1677        dp 	 * is null (anonymous tracing), we fast-path bypass this routine.
   1080  1677        dp 	 */
   1081  1677        dp 	ASSERT(s_cr != NULL);
   1082  1677        dp 
   1083  1677        dp 	if ((cr = CRED()) != NULL &&
   1084  1677        dp 	    s_cr->cr_zone == cr->cr_zone)
   1085  1677        dp 		return (1);
   1086  1677        dp 
   1087  1677        dp 	return (0);
   1088  1677        dp }
   1089  1677        dp 
   1090  1677        dp /*
   1091  1677        dp  * This privilege check should be used by actions and subroutines to
   1092  1677        dp  * verify that the process has not setuid or changed credentials.
   1093  1677        dp  */
   1094  1677        dp static int
   1095  1677        dp dtrace_priv_proc_common_nocd()
   1096  1677        dp {
   1097     0    stevel 	proc_t *proc;
   1098     0    stevel 
   1099  1677        dp 	if ((proc = ttoproc(curthread)) != NULL &&
   1100     0    stevel 	    !(proc->p_flag & SNOCD))
   1101     0    stevel 		return (1);
   1102     0    stevel 
   1103  1677        dp 	return (0);
   1104  1677        dp }
   1105  1677        dp 
   1106  1677        dp static int
   1107  1677        dp dtrace_priv_proc_destructive(dtrace_state_t *state)
   1108  1677        dp {
   1109  1677        dp 	int action = state->dts_cred.dcr_action;
   1110  1677        dp 
   1111  1677        dp 	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
   1112  1677        dp 	    dtrace_priv_proc_common_zone(state) == 0)
   1113  1677        dp 		goto bad;
   1114  1677        dp 
   1115  1677        dp 	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
   1116  1677        dp 	    dtrace_priv_proc_common_user(state) == 0)
   1117  1677        dp 		goto bad;
   1118  1677        dp 
   1119  1677        dp 	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
   1120  1677        dp 	    dtrace_priv_proc_common_nocd() == 0)
   1121  1677        dp 		goto bad;
   1122  1677        dp 
   1123  1677        dp 	return (1);
   1124  1677        dp 
   1125  1677        dp bad:
   1126     0    stevel 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
   1127     0    stevel 
   1128     0    stevel 	return (0);
   1129     0    stevel }
   1130     0    stevel 
   1131     0    stevel static int
   1132     0    stevel dtrace_priv_proc_control(dtrace_state_t *state)
   1133     0    stevel {
   1134     0    stevel 	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
   1135     0    stevel 		return (1);
   1136     0    stevel 
   1137  1677        dp 	if (dtrace_priv_proc_common_zone(state) &&
   1138  1677        dp 	    dtrace_priv_proc_common_user(state) &&
   1139  1677        dp 	    dtrace_priv_proc_common_nocd())
   1140  1677        dp 		return (1);
   1141  1677        dp 
   1142  1677        dp 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
   1143  1677        dp 
   1144  1677        dp 	return (0);
   1145     0    stevel }
   1146     0    stevel 
   1147     0    stevel static int
   1148     0    stevel dtrace_priv_proc(dtrace_state_t *state)
   1149     0    stevel {
   1150     0    stevel 	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
   1151     0    stevel 		return (1);
   1152     0    stevel 
   1153     0    stevel 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
   1154     0    stevel 
   1155     0    stevel 	return (0);
   1156     0    stevel }
   1157     0    stevel 
   1158     0    stevel static int
   1159     0    stevel dtrace_priv_kernel(dtrace_state_t *state)
   1160     0    stevel {
   1161     0    stevel 	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
   1162     0    stevel 		return (1);
   1163     0    stevel 
   1164     0    stevel 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
   1165     0    stevel 
   1166     0    stevel 	return (0);
   1167     0    stevel }
   1168     0    stevel 
   1169     0    stevel static int
   1170     0    stevel dtrace_priv_kernel_destructive(dtrace_state_t *state)
   1171     0    stevel {
   1172     0    stevel 	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
   1173     0    stevel 		return (1);
   1174     0    stevel 
   1175     0    stevel 	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
   1176     0    stevel 
   1177     0    stevel 	return (0);
   1178     0    stevel }
   1179     0    stevel 
   1180     0    stevel /*
   1181     0    stevel  * Note:  not called from probe context.  This function is called
   1182     0    stevel  * asynchronously (and at a regular interval) from outside of probe context to
   1183     0    stevel  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
   1184     0    stevel  * cleaning is explained in detail in <sys/dtrace_impl.h>.
   1185     0    stevel  */
   1186     0    stevel void
   1187     0    stevel dtrace_dynvar_clean(dtrace_dstate_t *dstate)
   1188     0    stevel {
   1189     0    stevel 	dtrace_dynvar_t *dirty;
   1190     0    stevel 	dtrace_dstate_percpu_t *dcpu;
   1191     0    stevel 	int i, work = 0;
   1192     0    stevel 
   1193     0    stevel 	for (i = 0; i < NCPU; i++) {
   1194     0    stevel 		dcpu = &dstate->dtds_percpu[i];
   1195     0    stevel 
   1196     0    stevel 		ASSERT(dcpu->dtdsc_rinsing == NULL);
   1197     0    stevel 
   1198     0    stevel 		/*
   1199     0    stevel 		 * If the dirty list is NULL, there is no dirty work to do.
   1200     0    stevel 		 */
   1201     0    stevel 		if (dcpu->dtdsc_dirty == NULL)
   1202     0    stevel 			continue;
   1203     0    stevel 
   1204     0    stevel 		/*
   1205     0    stevel 		 * If the clean list is non-NULL, then we're not going to do
   1206     0    stevel 		 * any work for this CPU -- it means that there has not been
   1207     0    stevel 		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
   1208     0    stevel 		 * since the last time we cleaned house.
   1209     0    stevel 		 */
   1210     0    stevel 		if (dcpu->dtdsc_clean != NULL)
   1211     0    stevel 			continue;
   1212     0    stevel 
   1213     0    stevel 		work = 1;
   1214     0    stevel 
   1215     0    stevel 		/*
   1216     0    stevel 		 * Atomically move the dirty list aside.
   1217     0    stevel 		 */
   1218     0    stevel 		do {
   1219     0    stevel 			dirty = dcpu->dtdsc_dirty;
   1220     0    stevel 
   1221     0    stevel 			/*
   1222     0    stevel 			 * Before we zap the dirty list, set the rinsing list.
   1223     0    stevel 			 * (This allows for a potential assertion in
   1224     0    stevel 			 * dtrace_dynvar():  if a free dynamic variable appears
   1225     0    stevel 			 * on a hash chain, either the dirty list or the
   1226     0    stevel 			 * rinsing list for some CPU must be non-NULL.)
   1227     0    stevel 			 */
   1228     0    stevel 			dcpu->dtdsc_rinsing = dirty;
   1229     0    stevel 			dtrace_membar_producer();
   1230     0    stevel 		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
   1231     0    stevel 		    dirty, NULL) != dirty);
   1232     0    stevel 	}
   1233     0    stevel 
   1234     0    stevel 	if (!work) {
   1235     0    stevel 		/*
   1236     0    stevel 		 * We have no work to do; we can simply return.
   1237     0    stevel 		 */
   1238     0    stevel 		return;
   1239     0    stevel 	}
   1240     0    stevel 
   1241     0    stevel 	dtrace_sync();
   1242     0    stevel 
   1243     0    stevel 	for (i = 0; i < NCPU; i++) {
   1244     0    stevel 		dcpu = &dstate->dtds_percpu[i];
   1245     0    stevel 
   1246     0    stevel 		if (dcpu->dtdsc_rinsing == NULL)
   1247     0    stevel 			continue;
   1248     0    stevel 
   1249     0    stevel 		/*
   1250     0    stevel 		 * We are now guaranteed that no hash chain contains a pointer
   1251     0    stevel 		 * into this dirty list; we can make it clean.
   1252     0    stevel 		 */
   1253     0    stevel 		ASSERT(dcpu->dtdsc_clean == NULL);
   1254     0    stevel 		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
   1255     0    stevel 		dcpu->dtdsc_rinsing = NULL;
   1256     0    stevel 	}
   1257     0    stevel 
   1258     0    stevel 	/*
   1259     0    stevel 	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
   1260     0    stevel 	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
   1261     0    stevel 	 * This prevents a race whereby a CPU incorrectly decides that
   1262     0    stevel 	 * the state should be something other than DTRACE_DSTATE_CLEAN
   1263     0    stevel 	 * after dtrace_dynvar_clean() has completed.
   1264     0    stevel 	 */
   1265     0    stevel 	dtrace_sync();
   1266     0    stevel 
   1267     0    stevel 	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
   1268     0    stevel }
   1269     0    stevel 
   1270     0    stevel /*
   1271     0    stevel  * Depending on the value of the op parameter, this function looks-up,
   1272     0    stevel  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
   1273     0    stevel  * allocation is requested, this function will return a pointer to a
   1274     0    stevel  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
   1275     0    stevel  * variable can be allocated.  If NULL is returned, the appropriate counter
   1276     0    stevel  * will be incremented.
   1277     0    stevel  */
/*
 * Arguments, as used below:
 *
 *	dstate		dynamic variable state supplying the hash table, the
 *			per-CPU free/clean/dirty/rinsing lists and chunk size
 *	nkeys		number of entries in key[]; asserted to be non-zero
 *	key		the tuple; an entry with a dttk_size of zero is hashed
 *			and compared by value, any other entry is treated as
 *			dttk_size bytes at address dttk_value
 *	dsize		size of the referred-to data; only used to check that
 *			the variable fits in a chunk
 *	op		DTRACE_DYNVAR_ALLOC, DTRACE_DYNVAR_NOALLOC or
 *			DTRACE_DYNVAR_DEALLOC
 *	mstate, vstate	handed to dtrace_canload() before by-reference key
 *			data is read
 *
 * A lookup that finds a matching tuple returns it for any op other than
 * DTRACE_DYNVAR_DEALLOC.  DTRACE_DYNVAR_DEALLOC always returns NULL, whether
 * or not a variable was found and released.  NULL is also returned if
 * CPU_DTRACE_FAULT is set once the keys have been hashed.
 */
   1278     0    stevel dtrace_dynvar_t *
   1279     0    stevel dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
   1280  2870        dp     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
   1281  2870        dp     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
   1282     0    stevel {
   1283  1739       bmc 	uint64_t hashval = DTRACE_DYNHASH_VALID;
   1284     0    stevel 	dtrace_dynhash_t *hash = dstate->dtds_hash;
   1285     0    stevel 	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
   1286     0    stevel 	processorid_t me = CPU->cpu_id, cpu = me;
   1287     0    stevel 	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
   1288     0    stevel 	size_t bucket, ksize;
   1289     0    stevel 	size_t chunksize = dstate->dtds_chunksize;
   1290     0    stevel 	uintptr_t kdata, lock, nstate;
   1291     0    stevel 	uint_t i;
   1292     0    stevel 
   1293     0    stevel 	ASSERT(nkeys != 0);
   1294     0    stevel 
   1295     0    stevel 	/*
   1296     0    stevel 	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
   1297     0    stevel 	 * algorithm.  For the by-value portions, we perform the algorithm in
   1298     0    stevel 	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
   1299     0    stevel 	 * bit, and seems to have only a minute effect on distribution.  For
   1300     0    stevel 	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
   1301     0    stevel 	 * over each referenced byte.  It's painful to do this, but it's much
   1302     0    stevel 	 * better than pathological hash distribution.  The efficacy of the
   1303     0    stevel 	 * hashing algorithm (and a comparison with other algorithms) may be
   1304     0    stevel 	 * found by running the ::dtrace_dynstat MDB dcmd.
   1305     0    stevel 	 */
   1306     0    stevel 	for (i = 0; i < nkeys; i++) {
   1307     0    stevel 		if (key[i].dttk_size == 0) {
   1308     0    stevel 			uint64_t val = key[i].dttk_value;
   1309     0    stevel 
   1310     0    stevel 			hashval += (val >> 48) & 0xffff;
   1311     0    stevel 			hashval += (hashval << 10);
   1312     0    stevel 			hashval ^= (hashval >> 6);
   1313     0    stevel 
   1314     0    stevel 			hashval += (val >> 32) & 0xffff;
   1315     0    stevel 			hashval += (hashval << 10);
   1316     0    stevel 			hashval ^= (hashval >> 6);
   1317     0    stevel 
   1318     0    stevel 			hashval += (val >> 16) & 0xffff;
   1319     0    stevel 			hashval += (hashval << 10);
   1320     0    stevel 			hashval ^= (hashval >> 6);
   1321     0    stevel 
   1322     0    stevel 			hashval += val & 0xffff;
   1323     0    stevel 			hashval += (hashval << 10);
   1324     0    stevel 			hashval ^= (hashval >> 6);
   1325     0    stevel 		} else {
   1326     0    stevel 			/*
   1327     0    stevel 			 * This is incredibly painful, but it beats the hell
   1328     0    stevel 			 * out of the alternative.
   1329     0    stevel 			 */
   1330     0    stevel 			uint64_t j, size = key[i].dttk_size;
   1331     0    stevel 			uintptr_t base = (uintptr_t)key[i].dttk_value;
   1332     0    stevel 
			/*
			 * Stop hashing as soon as a by-reference key fails
			 * the load check; the CPU_DTRACE_FAULT test after
			 * the loop decides whether to give up.
			 */
   1333  2870        dp 			if (!dtrace_canload(base, size, mstate, vstate))
   1334  2870        dp 				break;
   1335  2870        dp 
   1336     0    stevel 			for (j = 0; j < size; j++) {
   1337     0    stevel 				hashval += dtrace_load8(base + j);
   1338     0    stevel 				hashval += (hashval << 10);
   1339     0    stevel 				hashval ^= (hashval >> 6);
   1340     0    stevel 			}
   1341     0    stevel 		}
   1342     0    stevel 	}
   1343  2870        dp 
   1344  2870        dp 	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
   1345  2870        dp 		return (NULL);
   1346     0    stevel 
   1347     0    stevel 	hashval += (hashval << 3);
   1348     0    stevel 	hashval ^= (hashval >> 11);
   1349     0    stevel 	hashval += (hashval << 15);
   1350     0    stevel 
   1351     0    stevel 	/*
   1352  1739       bmc 	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
   1353  1739       bmc 	 * comes out to be one of our two sentinel hash values.  If this
   1354  1739       bmc 	 * actually happens, we set the hashval to be a value known to be a
   1355  1739       bmc 	 * non-sentinel value.
   1356  1739       bmc 	 */
   1357  1739       bmc 	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
   1358  1739       bmc 		hashval = DTRACE_DYNHASH_VALID;
   1359     0    stevel 
   1360     0    stevel 	/*
   1361     0    stevel 	 * Yes, it's painful to do a divide here.  If the cycle count becomes
   1362     0    stevel 	 * important here, tricks can be pulled to reduce it.  (However, it's
   1363     0    stevel 	 * critical that hash collisions be kept to an absolute minimum;
   1364     0    stevel 	 * they're much more painful than a divide.)  It's better to have a
   1365     0    stevel 	 * solution that generates few collisions and still keeps things
   1366     0    stevel 	 * relatively simple.
   1367     0    stevel 	 */
   1368     0    stevel 	bucket = hashval % dstate->dtds_hashsize;
   1369     0    stevel 
	/*
	 * A dealloc takes the bucket's lock word:  spin while its low bit is
	 * set, then compare-and-swap it from the observed value to that value
	 * plus one (leaving the low bit set).  The two exit paths for a
	 * dealloc release it by incrementing the word again.
	 */
   1370     0    stevel 	if (op == DTRACE_DYNVAR_DEALLOC) {
   1371     0    stevel 		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
   1372     0    stevel 
   1373     0    stevel 		for (;;) {
   1374     0    stevel 			while ((lock = *lockp) & 1)
   1375     0    stevel 				continue;
   1376     0    stevel 
   1377     0    stevel 			if (dtrace_casptr((void *)lockp,
   1378     0    stevel 			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
   1379     0    stevel 				break;
   1380     0    stevel 		}
   1381     0    stevel 
   1382     0    stevel 		dtrace_membar_producer();
   1383     0    stevel 	}
   1384     0    stevel 
   1385     0    stevel top:
	/*
	 * (Re)start the chain walk.  The lock word is sampled before the
	 * chain head is read; a DTRACE_DYNVAR_NOALLOC miss compares it again
	 * before returning, and restarts from here if it has changed.
	 */
   1386     0    stevel 	prev = NULL;
   1387     0    stevel 	lock = hash[bucket].dtdh_lock;
   1388     0    stevel 
   1389     0    stevel 	dtrace_membar_consumer();
   1390     0    stevel 
   1391     0    stevel 	start = hash[bucket].dtdh_chain;
   1392  1739       bmc 	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
   1393  1739       bmc 	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
   1394  1739       bmc 	    op != DTRACE_DYNVAR_DEALLOC));
   1395     0    stevel 
   1396     0    stevel 	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
   1397     0    stevel 		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
   1398     0    stevel 		dtrace_key_t *dkey = &dtuple->dtt_key[0];
   1399     0    stevel 
   1400     0    stevel 		if (dvar->dtdv_hashval != hashval) {
   1401  1739       bmc 			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
   1402  1739       bmc 				/*
   1403  1739       bmc 				 * We've reached the sink, and therefore the
   1404  1739       bmc 				 * end of the hash chain; we can kick out of
   1405  1739       bmc 				 * the loop knowing that we have seen a valid
   1406  1739       bmc 				 * snapshot of state.
   1407  1739       bmc 				 */
   1408  1739       bmc 				ASSERT(dvar->dtdv_next == NULL);
   1409  1739       bmc 				ASSERT(dvar == &dtrace_dynhash_sink);
   1410  1739       bmc 				break;
   1411  1739       bmc 			}
   1412  1739       bmc 
   1413  1739       bmc 			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
   1414  1739       bmc 				/*
   1415  1739       bmc 				 * We've gone off the rails:  somewhere along
   1416  1739       bmc 				 * the line, one of the members of this hash
   1417  1739       bmc 				 * chain was deleted.  Note that we could also
   1418  1739       bmc 				 * detect this by simply letting this loop run
   1419  1739       bmc 				 * to completion, as we would eventually hit
   1420  1739       bmc 				 * the end of the dirty list.  However, we
   1421  1739       bmc 				 * want to avoid running the length of the
   1422  1739       bmc 				 * dirty list unnecessarily (it might be quite
   1423  1739       bmc 				 * long), so we catch this as early as
   1424  1739       bmc 				 * possible by detecting the hash marker.  In
   1425  1739       bmc 				 * this case, we simply set dvar to NULL and
   1426  1739       bmc 				 * break; the conditional after the loop will
   1427  1739       bmc 				 * send us back to top.
   1428  1739       bmc 				 */
   1429  1739       bmc 				dvar = NULL;
   1430  1739       bmc 				break;
   1431     0    stevel 			}
   1432     0    stevel 
   1433     0    stevel 			goto next;
   1434     0    stevel 		}
   1435     0    stevel 
   1436     0    stevel 		if (dtuple->dtt_nkeys != nkeys)
   1437     0    stevel 			goto next;
   1438     0    stevel 
   1439     0    stevel 		for (i = 0; i < nkeys; i++, dkey++) {
   1440     0    stevel 			if (dkey->dttk_size != key[i].dttk_size)
   1441     0    stevel 				goto next; /* size or type mismatch */
   1442     0    stevel 
   1443     0    stevel 			if (dkey->dttk_size != 0) {
   1444     0    stevel 				if (dtrace_bcmp(
   1445     0    stevel 				    (void *)(uintptr_t)key[i].dttk_value,
   1446     0    stevel 				    (void *)(uintptr_t)dkey->dttk_value,
   1447     0    stevel 				    dkey->dttk_size))
   1448     0    stevel 					goto next;
   1449     0    stevel 			} else {
   1450     0    stevel 				if (dkey->dttk_value != key[i].dttk_value)
   1451     0    stevel 					goto next;
   1452     0    stevel 			}
   1453     0    stevel 		}
   1454     0    stevel 
		/*
		 * Hash value, key count and every key matched:  this is the
		 * variable.  Anything other than a dealloc hands it back
		 * as-is.
		 */
   1455     0    stevel 		if (op != DTRACE_DYNVAR_DEALLOC)
   1456     0    stevel 			return (dvar);
   1457     0    stevel 
   1458     0    stevel 		ASSERT(dvar->dtdv_next == NULL ||
   1459  1739       bmc 		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
   1460     0    stevel 
		/*
		 * Unlink the variable.  An interior element is removed with
		 * a plain store through its predecessor; the chain head is
		 * swung with a compare-and-swap.
		 */
   1461     0    stevel 		if (prev != NULL) {
   1462     0    stevel 			ASSERT(hash[bucket].dtdh_chain != dvar);
   1463     0    stevel 			ASSERT(start != dvar);
   1464     0    stevel 			ASSERT(prev->dtdv_next == dvar);
   1465     0    stevel 			prev->dtdv_next = dvar->dtdv_next;
   1466     0    stevel 		} else {
   1467     0    stevel 			if (dtrace_casptr(&hash[bucket].dtdh_chain,
   1468     0    stevel 			    start, dvar->dtdv_next) != start) {
   1469     0    stevel 				/*
   1470     0    stevel 				 * We have failed to atomically swing the
   1471     0    stevel 				 * hash table head pointer, presumably because
   1472     0    stevel 				 * of a conflicting allocation on another CPU.
   1473     0    stevel 				 * We need to reread the hash chain and try
   1474     0    stevel 				 * again.
   1475     0    stevel 				 */
   1476     0    stevel 				goto top;
   1477     0    stevel 			}
   1478     0    stevel 		}
   1479     0    stevel 
   1480     0    stevel 		dtrace_membar_producer();
   1481     0    stevel 
   1482     0    stevel 		/*
   1483  1739       bmc 		 * Now set the hash value to indicate that it's free.
   1484     0    stevel 		 */
   1485     0    stevel 		ASSERT(hash[bucket].dtdh_chain != dvar);
   1486  1739       bmc 		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
   1487     0    stevel 
   1488     0    stevel 		dtrace_membar_producer();
   1489     0    stevel 
   1490     0    stevel 		/*
   1491     0    stevel 		 * Set the next pointer to point at the dirty list, and
   1492     0    stevel 		 * atomically swing the dirty pointer to the newly freed dvar.
   1493     0    stevel 		 */
   1494     0    stevel 		do {
   1495     0    stevel 			next = dcpu->dtdsc_dirty;
   1496     0    stevel 			dvar->dtdv_next = next;
   1497     0    stevel 		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
   1498     0    stevel 
   1499     0    stevel 		/*
   1500     0    stevel 		 * Finally, unlock this hash bucket.
   1501     0    stevel 		 */
   1502     0    stevel 		ASSERT(hash[bucket].dtdh_lock == lock);
   1503     0    stevel 		ASSERT(lock & 1);
   1504     0    stevel 		hash[bucket].dtdh_lock++;
   1505     0    stevel 
   1506     0    stevel 		return (NULL);
   1507     0    stevel next:
		/*
		 * No match on this element.  Remember it as the predecessor
		 * so that a later match can be unlinked through it.
		 */
   1508     0    stevel 		prev = dvar;
   1509     0    stevel 		continue;
   1510  1739       bmc 	}
   1511  1739       bmc 
   1512  1739       bmc 	if (dvar == NULL) {
   1513  1739       bmc 		/*
   1514  1739       bmc 		 * If dvar is NULL, it is because we went off the rails:
   1515  1739       bmc 		 * one of the elements that we traversed in the hash chain
   1516  1739       bmc 		 * was deleted while we were traversing it.  In this case,
   1517  1739       bmc 		 * we assert that we aren't doing a dealloc (deallocs lock
   1518  1739       bmc 		 * the hash bucket to prevent themselves from racing with
   1519  1739       bmc 		 * one another), and retry the hash chain traversal.
   1520  1739       bmc 		 */
   1521  1739       bmc 		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
   1522  1739       bmc 		goto top;
   1523     0    stevel 	}
   1524     0    stevel 
   1525     0    stevel 	if (op != DTRACE_DYNVAR_ALLOC) {
   1526     0    stevel 		/*
   1527     0    stevel 		 * If we are not to allocate a new variable, we want to
   1528     0    stevel 		 * return NULL now.  Before we return, check that the value
   1529     0    stevel 		 * of the lock word hasn't changed.  If it has, we may have
   1530     0    stevel 		 * seen an inconsistent snapshot.
   1531     0    stevel 		 */
   1532     0    stevel 		if (op == DTRACE_DYNVAR_NOALLOC) {
   1533     0    stevel 			if (hash[bucket].dtdh_lock != lock)
   1534     0    stevel 				goto top;
   1535     0    stevel 		} else {
   1536     0    stevel 			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
   1537     0    stevel 			ASSERT(hash[bucket].dtdh_lock == lock);
   1538     0    stevel 			ASSERT(lock & 1);
   1539     0    stevel 			hash[bucket].dtdh_lock++;
   1540     0    stevel 		}
   1541     0    stevel 
   1542     0    stevel 		return (NULL);
   1543     0    stevel 	}
   1544     0    stevel 
   1545     0    stevel 	/*
   1546     0    stevel 	 * We need to allocate a new dynamic variable.  The size we need is the
   1547     0    stevel 	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
   1548     0    stevel 	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
   1549     0    stevel 	 * the size of any referred-to data (dsize).  We then round the final
   1550     0    stevel 	 * size up to the chunksize for allocation.
   1551     0    stevel 	 */
   1552     0    stevel 	for (ksize = 0, i = 0; i < nkeys; i++)
   1553     0    stevel 		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
   1554     0    stevel 
   1555     0    stevel 	/*
   1556     0    stevel 	 * This should be pretty much impossible, but could happen if, say,
   1557     0    stevel 	 * strange DIF specified the tuple.  Ideally, this should be an
   1558     0    stevel 	 * assertion and not an error condition -- but that requires that the
   1559     0    stevel 	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
   1560     0    stevel 	 * bullet-proof.  (That is, it must not be able to be fooled by
   1561     0    stevel 	 * malicious DIF.)  Given the lack of backwards branches in DIF,
   1562     0    stevel 	 * solving this would presumably not amount to solving the Halting
   1563     0    stevel 	 * Problem -- but it still seems awfully hard.
   1564     0    stevel 	 */
   1565     0    stevel 	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
   1566     0    stevel 	    ksize + dsize > chunksize) {
   1567     0    stevel 		dcpu->dtdsc_drops++;
   1568     0    stevel 		return (NULL);
   1569     0    stevel 	}
   1570     0    stevel 
   1571     0    stevel 	nstate = DTRACE_DSTATE_EMPTY;
   1572     0    stevel 
	/*
	 * Pop a chunk from a per-CPU free list, starting with this CPU's.
	 * An empty free list is refilled from that CPU's clean list when one
	 * is available; otherwise, while the state is DTRACE_DSTATE_CLEAN,
	 * the search moves on to the next CPU.  nstate accumulates the state
	 * to install once every CPU has been tried without success.
	 */
   1573     0    stevel 	do {
   1574     0    stevel retry:
   1575     0    stevel 		free = dcpu->dtdsc_free;
   1576     0    stevel 
   1577     0    stevel 		if (free == NULL) {
   1578     0    stevel 			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
   1579     0    stevel 			void *rval;
   1580     0    stevel 
   1581     0    stevel 			if (clean == NULL) {
   1582     0    stevel 				/*
   1583     0    stevel 				 * We're out of dynamic variable space on
   1584     0    stevel 				 * this CPU.  Unless we have tried all CPUs,
   1585     0    stevel 				 * we'll try to allocate from a different
   1586     0    stevel 				 * CPU.
   1587     0    stevel 				 */
   1588     0    stevel 				switch (dstate->dtds_state) {
   1589     0    stevel 				case DTRACE_DSTATE_CLEAN: {
   1590     0    stevel 					void *sp = &dstate->dtds_state;
   1591     0    stevel 
   1592     0    stevel 					if (++cpu >= NCPU)
   1593     0    stevel 						cpu = 0;
   1594     0    stevel 
   1595     0    stevel 					if (dcpu->dtdsc_dirty != NULL &&
   1596     0    stevel 					    nstate == DTRACE_DSTATE_EMPTY)
   1597     0    stevel 						nstate = DTRACE_DSTATE_DIRTY;
   1598     0    stevel 
   1599     0    stevel 					if (dcpu->dtdsc_rinsing != NULL)
   1600     0    stevel 						nstate = DTRACE_DSTATE_RINSING;
   1601     0    stevel 
   1602     0    stevel 					dcpu = &dstate->dtds_percpu[cpu];
   1603     0    stevel 
   1604     0    stevel 					if (cpu != me)
   1605     0    stevel 						goto retry;
   1606     0    stevel 
   1607     0    stevel 					(void) dtrace_cas32(sp,
   1608     0    stevel 					    DTRACE_DSTATE_CLEAN, nstate);
   1609     0    stevel 
   1610     0    stevel 					/*
   1611     0    stevel 					 * To increment the correct bean
   1612     0    stevel 					 * counter, take another lap.
   1613     0    stevel 					 */
   1614     0    stevel 					goto retry;
   1615     0    stevel 				}
   1616     0    stevel 
   1617     0    stevel 				case DTRACE_DSTATE_DIRTY:
   1618     0    stevel 					dcpu->dtdsc_dirty_drops++;
   1619     0    stevel 					break;
   1620     0    stevel 
   1621     0    stevel 				case DTRACE_DSTATE_RINSING:
   1622     0    stevel 					dcpu->dtdsc_rinsing_drops++;
   1623     0    stevel 					break;
   1624     0    stevel 
   1625     0    stevel 				case DTRACE_DSTATE_EMPTY:
   1626     0    stevel 					dcpu->dtdsc_drops++;
   1627     0    stevel 					break;
   1628     0    stevel 				}
   1629     0    stevel 
   1630     0    stevel 				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
   1631     0    stevel 				return (NULL);
   1632     0    stevel 			}
   1633     0    stevel 
   1634     0    stevel 			/*
   1635     0    stevel 			 * The clean list appears to be non-empty.  We want to
   1636     0    stevel 			 * move the clean list to the free list; we start by
   1637     0    stevel 			 * moving the clean pointer aside.
   1638     0    stevel 			 */
   1639     0    stevel 			if (dtrace_casptr(&dcpu->dtdsc_clean,
   1640     0    stevel 			    clean, NULL) != clean) {
   1641     0    stevel 				/*
   1642     0    stevel 				 * We are in one of two situations:
   1643     0    stevel 				 *
   1644     0    stevel 				 *  (a)	The clean list was switched to the
   1645     0    stevel 				 *	free list by another CPU.
   1646     0    stevel 				 *
   1647     0    stevel 				 *  (b)	The clean list was added to by the
   1648     0    stevel 				 *	cleansing cyclic.
   1649     0    stevel 				 *
   1650     0    stevel 				 * In either of these situations, we can
   1651     0    stevel 				 * just reattempt the free list allocation.
   1652     0    stevel 				 */
   1653     0    stevel 				goto retry;
   1654     0    stevel 			}
   1655     0    stevel 
   1656  1739       bmc 			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
   1657     0    stevel 
   1658     0    stevel 			/*
   1659     0    stevel 			 * Now we'll move the clean list to the free list.
   1660     0    stevel 			 * It's impossible for this to fail:  the only way
   1661     0    stevel 			 * the free list can be updated is through this
   1662     0    stevel 			 * code path, and only one CPU can own the clean list.
   1663     0    stevel 			 * Thus, it would only be possible for this to fail if
   1664     0    stevel 			 * this code were racing with dtrace_dynvar_clean().
   1665     0    stevel 			 * (That is, if dtrace_dynvar_clean() updated the clean
   1666     0    stevel 			 * list, and we ended up racing to update the free
   1667     0    stevel 			 * list.)  This race is prevented by the dtrace_sync()
   1668     0    stevel 			 * in dtrace_dynvar_clean() -- which flushes the
   1669     0    stevel 			 * owners of the clean lists out before resetting
   1670     0    stevel 			 * the clean lists.
   1671     0    stevel 			 */
   1672     0    stevel 			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
   1673     0    stevel 			ASSERT(rval == NULL);
   1674     0    stevel 			goto retry;
   1675     0    stevel 		}
   1676     0    stevel 
   1677     0    stevel 		dvar = free;
   1678     0    stevel 		new_free = dvar->dtdv_next;
   1679     0    stevel 	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
   1680     0    stevel 
   1681     0    stevel 	/*
   1682     0    stevel 	 * We have now allocated a new chunk.  We copy the tuple keys into the
   1683     0    stevel 	 * tuple array and copy any referenced key data into the data space
   1684     0    stevel 	 * following the tuple array.  As we do this, we relocate dttk_value
   1685     0    stevel 	 * in the final tuple to point to the key data address in the chunk.
   1686     0    stevel 	 */
   1687     0    stevel 	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
   1688     0    stevel 	dvar->dtdv_data = (void *)(kdata + ksize);
   1689     0    stevel 	dvar->dtdv_tuple.dtt_nkeys = nkeys;
   1690     0    stevel 
   1691     0    stevel 	for (i = 0; i < nkeys; i++) {
   1692     0    stevel 		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
   1693     0    stevel 		size_t kesize = key[i].dttk_size;
   1694     0    stevel 
   1695     0    stevel 		if (kesize != 0) {
   1696     0    stevel 			dtrace_bcopy(
   1697     0    stevel 			    (const void *)(uintptr_t)key[i].dttk_value,
   1698     0    stevel 			    (void *)kdata, kesize);
   1699     0    stevel 			dkey->dttk_value = kdata;
   1700     0    stevel 			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
   1701     0    stevel 		} else {
   1702     0    stevel 			dkey->dttk_value = key[i].dttk_value;
   1703     0    stevel 		}
   1704     0    stevel 
   1705     0    stevel 		dkey->dttk_size = kesize;
   1706     0    stevel 	}
   1707     0    stevel 
	/*
	 * Publish the new variable by pushing it onto the head of the chain,
	 * provided the head is still the one that was read at "top".
	 */
   1708  1739       bmc 	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
   1709     0    stevel 	dvar->dtdv_hashval = hashval;
   1710     0    stevel 	dvar->dtdv_next = start;
   1711     0    stevel 
   1712     0    stevel 	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
   1713     0    stevel 		return (dvar);
   1714     0    stevel 
   1715     0    stevel 	/*
   1716     0    stevel 	 * The cas has failed.  Either another CPU is adding an element to
   1717     0    stevel 	 * this hash chain, or another CPU is deleting an element from this
   1718     0    stevel 	 * hash chain.  The simplest way to deal with both of these cases
   1719     0    stevel 	 * (though not necessarily the most efficient) is to free our
   1720     0    stevel 	 * allocated block and tail-call ourselves.  Note that the free is
   1721     0    stevel 	 * to the dirty list and _not_ to the free list.  This is to prevent
   1722     0    stevel 	 * races with allocators, above.
   1723     0    stevel 	 */
   1724  1739       bmc 	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
   1725     0    stevel 
   1726     0    stevel 	dtrace_membar_producer();
   1727     0    stevel 
   1728     0    stevel 	do {
   1729     0    stevel 		free = dcpu->dtdsc_dirty;
   1730     0    stevel 		dvar->dtdv_next = free;
   1731     0    stevel 	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
   1732     0    stevel 
   1733  2870        dp 	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
   1734     0    stevel }
   1735     0    stevel 
   1736   457       bmc /*ARGSUSED*/
   1737   457       bmc static void
   1738   457       bmc dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
   1739     0    stevel {
   1740  5984   jhaslam 	if ((int64_t)nval < (int64_t)*oval)
   1741     0    stevel 		*oval = nval;
   1742     0    stevel }
   1743     0    stevel 
   1744   457       bmc /*ARGSUSED*/
   1745   457       bmc static void
   1746   457       bmc dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
   1747     0    stevel {
   1748  5984   jhaslam 	if ((int64_t)nval > (int64_t)*oval)
   1749     0    stevel 		*oval = nval;
   1750     0    stevel }
   1751     0    stevel 
   1752     0    stevel static void
   1753   457       bmc dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
   1754     0    stevel {
   1755     0    stevel 	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
   1756     0    stevel 	int64_t val = (int64_t)nval;
   1757     0    stevel 
   1758     0    stevel 	if (val < 0) {
   1759     0    stevel 		for (i = 0; i < zero; i++) {
   1760     0    stevel 			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
   1761   457       bmc 				quanta[i] += incr;
   1762     0    stevel 				return;
   1763     0    stevel 			}
   1764     0    stevel 		}
   1765     0    stevel 	} else {
   1766     0    stevel 		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
   1767     0    stevel 			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
   1768   457       bmc 				quanta[i - 1] += incr;
   1769     0    stevel 				return;
   1770     0    stevel 			}
   1771     0    stevel 		}
   1772     0    stevel 
   1773   457       bmc 		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
   1774     0    stevel 		return;
   1775     0    stevel 	}
   1776     0    stevel 
   1777     0    stevel 	ASSERT(0);
   1778     0    stevel }
   1779     0    stevel 
   1780     0    stevel static void
   1781   457       bmc dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
   1782     0    stevel {
   1783     0    stevel 	uint64_t arg = *lquanta++;
   1784     0    stevel 	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
   1785     0    stevel 	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
   1786     0    stevel 	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
   1787     0    stevel 	int32_t val = (int32_t)nval, level;
   1788     0    stevel 
   1789     0    stevel 	ASSERT(step != 0);
   1790     0    stevel 	ASSERT(levels != 0);
   1791     0    stevel 
   1792     0    stevel 	if (val < base) {
   1793     0    stevel 		/*
   1794     0    stevel 		 * This is an underflow.
   1795     0    stevel 		 */
   1796   457       bmc 		lquanta[0] += incr;
   1797     0    stevel 		return;
   1798     0    stevel 	}
   1799     0    stevel 
   1800     0    stevel 	level = (val - base) / step;
   1801     0    stevel 
   1802     0    stevel 	if (level < levels) {
   1803   457       bmc 		lquanta[level + 1] += incr;
   1804     0    stevel 		return;
   1805     0    stevel 	}
   1806     0    stevel 
   1807     0    stevel 	/*
   1808     0    stevel 	 * This is an overflow.
   1809     0    stevel 	 */
   1810   457       bmc 	lquanta[levels + 1] += incr;
   1811   457       bmc }
   1812   457       bmc 
   1813   457       bmc /*ARGSUSED*/
   1814   457       bmc static void
   1815   457       bmc dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
   1816     0    stevel {
   1817     0    stevel 	data[0]++;
   1818     0    stevel 	data[1] += nval;
   1819  5984   jhaslam }
   1820  5984   jhaslam 
   1821  5984   jhaslam /*ARGSUSED*/
   1822  5984   jhaslam static void
   1823  5984   jhaslam dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
   1824  5984   jhaslam {
   1825  5984   jhaslam 	int64_t snval = (int64_t)nval;
   1826  5984   jhaslam 	uint64_t tmp[2];
   1827  5984   jhaslam 
   1828  5984   jhaslam 	data[0]++;
   1829  5984   jhaslam 	data[1] += nval;
   1830  5984   jhaslam 
   1831  5984   jhaslam 	/*
   1832  5984   jhaslam 	 * What we want to say here is:
   1833  5984   jhaslam 	 *
   1834  5984   jhaslam 	 * data[2] += nval * nval;
   1835  5984   jhaslam 	 *
   1836  5984   jhaslam 	 * But given that nval is 64-bit, we could easily overflow, so
   1837  5984   jhaslam 	 * we do this as 128-bit arithmetic.
   1838  5984   jhaslam 	 */
   1839  5984   jhaslam 	if (snval < 0)
   1840  5984   jhaslam 		snval = -snval;
   1841  5984   jhaslam 
   1842  5984   jhaslam 	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
   1843  5984   jhaslam 	dtrace_add_128(data + 2, tmp, data + 2);
   1844     0    stevel }
   1845     0    stevel 
   1846     0    stevel /*ARGSUSED*/
   1847     0    stevel static void
   1848   457       bmc dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
   1849     0    stevel {
   1850     0    stevel 	*oval = *oval + 1;
   1851     0    stevel }
   1852     0    stevel 
   1853     0    stevel /*ARGSUSED*/
   1854     0    stevel static void
   1855   457       bmc dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
   1856     0    stevel {
   1857     0    stevel 	*oval += nval;
   1858     0    stevel }
   1859     0    stevel 
   1860     0    stevel /*
   1861     0    stevel  * Aggregate given the tuple in the principal data buffer, and the aggregating
   1862     0    stevel  * action denoted by the specified dtrace_aggregation_t.  The aggregation
   1863     0    stevel  * buffer is specified as the buf parameter.  This routine does not return
   1864     0    stevel  * failure; if there is no space in the aggregation buffer, the data will be
   1865     0    stevel  * dropped, and a corresponding counter incremented.
   1866     0    stevel  */
   1867     0    stevel static void
   1868     0    stevel dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
   1869   457       bmc     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
   1870     0    stevel {
   1871     0    stevel 	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
   1872     0    stevel 	uint32_t i, ndx, size, fsize;
   1873     0    stevel 	uint32_t align = sizeof (uint64_t) - 1;
   1874     0    stevel 	dtrace_aggbuffer_t *agb;
   1875     0    stevel 	dtrace_aggkey_t *key;
   1876  1017       bmc 	uint32_t hashval = 0, limit, isstr;
   1877     0    stevel 	caddr_t tomax, data, kdata;
   1878     0    stevel 	dtrace_actkind_t action;
   1879  1017       bmc 	dtrace_action_t *act;
   1880     0    stevel 	uintptr_t offs;
   1881     0    stevel 
   1882     0    stevel 	if (buf == NULL)
   1883     0    stevel 		return;
   1884     0    stevel 
   1885   457       bmc 	if (!agg->dtag_hasarg) {
   1886   457       bmc 		/*
   1887   457       bmc 		 * Currently, only quantize() and lquantize() take additional
   1888   457       bmc 		 * arguments, and they have the same semantics:  an increment
   1889   457       bmc 		 * value that defaults to 1 when not present.  If additional
   1890   457       bmc 		 * aggregating actions take arguments, the setting of the
   1891   457       bmc 		 * default argument value will presumably have to become more
   1892   457       bmc 		 * sophisticated...
   1893   457       bmc 		 */
   1894   457       bmc 		arg = 1;
   1895   457       bmc 	}
   1896   457       bmc 
   1897     0    stevel 	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
   1898     0    stevel 	size = rec->dtrd_offset - agg->dtag_base;
   1899     0    stevel 	fsize = size + rec->dtrd_size;
   1900     0    stevel 
   1901     0    stevel 	ASSERT(dbuf->dtb_tomax != NULL);
   1902     0    stevel 	data = dbuf->dtb_tomax + offset + agg->dtag_base;
   1903     0    stevel 
   1904     0    stevel 	if ((tomax = buf->dtb_tomax) == NULL) {
   1905     0    stevel 		dtrace_buffer_drop(buf);
   1906     0    stevel 		return;
   1907     0    stevel 	}
   1908     0    stevel 
   1909     0    stevel 	/*
   1910     0    stevel 	 * The metastructure is always at the bottom of the buffer.
   1911     0    stevel 	 */
   1912     0    stevel 	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
   1913     0    stevel 	    sizeof (dtrace_aggbuffer_t));
   1914     0    stevel 
   1915     0    stevel 	if (buf->dtb_offset == 0) {
   1916     0    stevel 		/*
   1917     0    stevel 		 * We just kludge up approximately 1/8th of the size to be
   1918     0    stevel 		 * buckets.  If this guess ends up being routinely
   1919     0    stevel 		 * off-the-mark, we may need to dynamically readjust this
   1920     0    stevel 		 * based on past performance.
   1921     0    stevel 		 */
   1922     0    stevel 		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
   1923     0    stevel 
   1924     0    stevel 		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
   1925     0    stevel 		    (uintptr_t)tomax || hashsize == 0) {
   1926     0    stevel 			/*
   1927     0    stevel 			 * We've been given a ludicrously small buffer;
   1928     0    stevel 			 * increment our drop count and leave.
   1929     0    stevel 			 */
   1930     0    stevel 			dtrace_buffer_drop(buf);
   1931     0    stevel 			return;
   1932     0    stevel 		}
   1933     0    stevel 
   1934     0    stevel 		/*
   1935     0    stevel 		 * And now, a pathetic attempt to try to get a an odd (or
   1936     0    stevel 		 * perchance, a prime) hash size for better hash distribution.
   1937     0    stevel 		 */
   1938     0    stevel 		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
   1939     0    stevel 			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
   1940     0    stevel 
   1941     0    stevel 		agb->dtagb_hashsize = hashsize;
   1942     0    stevel 		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
   1943     0    stevel 		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
   1944     0    stevel 		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
   1945     0    stevel 
   1946     0    stevel 		for (i = 0; i < agb->dtagb_hashsize; i++)
   1947     0    stevel 			agb->dtagb_hash[i] = NULL;
   1948     0    stevel 	}
   1949  1017       bmc 
   1950  1017       bmc 	ASSERT(agg->dtag_first != NULL);
   1951  1017       bmc 	ASSERT(agg->dtag_first->dta_intuple);
   1952     0    stevel 
   1953     0    stevel 	/*
   1954     0    stevel 	 * Calculate the hash value based on the key.  Note that we _don't_
   1955     0    stevel 	 * include the aggid in the hashing (but we will store it as part of
   1956     0    stevel 	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
   1957     0    stevel 	 * algorithm: a simple, quick algorithm that has no known funnels, and
   1958     0    stevel 	 * gets good distribution in practice.  The efficacy of the hashing
   1959     0    stevel 	 * algorithm (and a comparison with other algorithms) may be found by
   1960     0    stevel 	 * running the ::dtrace_aggstat MDB dcmd.
   1961     0    stevel 	 */
   1962  1017       bmc 	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
   1963  1017       bmc 		i = act->dta_rec.dtrd_offset - agg->dtag_base;
   1964  1017       bmc 		limit = i + act->dta_rec.dtrd_size;
   1965  1017       bmc 		ASSERT(limit <= size);
   1966  1017       bmc 		isstr = DTRACEACT_ISSTRING(act);
   1967  1017       bmc 
   1968  1017       bmc 		for (; i < limit; i++) {
   1969  1017       bmc 			hashval += data[i];
   1970  1017       bmc 			hashval += (hashval << 10);
   1971  1017       bmc 			hashval ^= (hashval >> 6);
   1972  1017       bmc 
   1973  1017       bmc 			if (isstr && data[i] == '\0')
   1974  1017       bmc 				break;
   1975  1017       bmc 		}
   1976     0    stevel 	}
   1977     0    stevel 
   1978     0    stevel 	hashval += (hashval << 3);
   1979     0    stevel 	hashval ^= (hashval >> 11);
   1980     0    stevel 	hashval += (hashval << 15);
   1981     0    stevel 
   1982     0    stevel 	/*
   1983  1017       bmc 	 * Yes, the divide here is expensive -- but it's generally the least
   1984  1017       bmc 	 * of the performance issues given the amount of data that we iterate
   1985  1017       bmc 	 * over to compute hash values, compare data, etc.
   1986     0    stevel 	 */
   1987     0    stevel 	ndx = hashval % agb->dtagb_hashsize;
   1988     0    stevel 
   1989     0    stevel 	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
   1990     0    stevel 		ASSERT((caddr_t)key >= tomax);
   1991     0    stevel 		ASSERT((caddr_t)key < tomax + buf->dtb_size);
   1992     0    stevel 
   1993     0    stevel 		if (hashval != key->dtak_hashval || key->dtak_size != size)
   1994     0    stevel 			continue;
   1995     0    stevel 
   1996     0    stevel 		kdata = key->dtak_data;
   1997     0    stevel 		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
   1998     0    stevel 
   1999  1017       bmc 		for (act = agg->dtag_first; act->dta_intuple;
   2000  1017       bmc 		    act = act->dta_next) {
   2001  1017       bmc 			i = act->dta_rec.dtrd_offset - agg->dtag_base;
   2002  1017       bmc 			limit = i + act->dta_rec.dtrd_size;
   2003  1017       bmc 			ASSERT(limit <= size);
   2004  1017       bmc 			isstr = DTRACEACT_ISSTRING(act);
   2005  1017       bmc 
   2006  1017       bmc 			for (; i < limit; i++) {
   2007  1017       bmc 				if (kdata[i] != data[i])
   2008  1017       bmc 					goto next;
   2009  1017       bmc 
   2010  1017       bmc 				if (isstr && data[i] == '\0')
   2011  1017       bmc 					break;
   2012  1017       bmc 			}
   2013     0    stevel 		}
   2014     0    stevel 
   2015     0    stevel 		if (action != key->dtak_action) {
   2016     0    stevel 			/*
   2017     0    stevel 			 * We are aggregating on the same value in the same
   2018     0    stevel 			 * aggregation with two different aggregating actions.
   2019     0    stevel 			 * (This should have been picked up in the compiler,
   2020     0    stevel 			 * so we may be dealing with errant or devious DIF.)
   2021     0    stevel 			 * This is an error condition; we indicate as much,
   2022     0    stevel 			 * and return.
   2023     0    stevel 			 */
   2024     0    stevel 			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
   2025     0    stevel 			return;
   2026     0    stevel 		}
   2027     0    stevel 
   2028     0    stevel 		/*
   2029     0    stevel 		 * This is a hit:  we need to apply the aggregator to
   2030     0    stevel 		 * the value at this key.
   2031     0    stevel 		 */
   2032   457       bmc 		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
   2033     0    stevel 		return;
   2034     0    stevel next:
   2035     0    stevel 		continue;
   2036     0    stevel 	}
   2037     0    stevel 
   2038     0    stevel 	/*
   2039     0    stevel 	 * We didn't find it.  We need to allocate some zero-filled space,
   2040     0    stevel 	 * link it into the hash table appropriately, and apply the aggregator
   2041     0    stevel 	 * to the (zero-filled) value.
   2042     0    stevel 	 */
   2043     0    stevel 	offs = buf->dtb_offset;
   2044     0    stevel 	while (offs & (align - 1))
   2045     0    stevel 		offs += sizeof (uint32_t);
   2046     0    stevel 
   2047     0    stevel 	/*
   2048     0    stevel 	 * If we don't have enough room to both allocate a new key _and_
   2049     0    stevel 	 * its associated data, increment the drop count and return.
   2050     0    stevel 	 */
   2051     0    stevel 	if ((uintptr_t)tomax + offs + fsize >
   2052     0    stevel 	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
   2053     0    stevel 		dtrace_buffer_drop(buf);
   2054     0    stevel 		return;
   2055     0    stevel 	}
   2056     0    stevel 
   2057     0    stevel 	/*CONSTCOND*/
   2058     0    stevel 	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
   2059     0    stevel 	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
   2060     0    stevel 	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
   2061     0    stevel 
   2062     0    stevel 	key->dtak_data = kdata = tomax + offs;
   2063     0    stevel 	buf->dtb_offset = offs + fsize;
   2064     0    stevel 
   2065     0    stevel 	/*
   2066     0    stevel 	 * Now copy the data across.
   2067     0    stevel 	 */
   2068     0    stevel 	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
   2069     0    stevel 
   2070     0    stevel 	for (i = sizeof (dtrace_aggid_t); i < size; i++)
   2071     0    stevel 		kdata[i] = data[i];
   2072  1017       bmc 
   2073  1017       bmc 	/*
   2074  1017       bmc 	 * Because strings are not zeroed out by default, we need to iterate
   2075  1017       bmc 	 * looking for actions that store strings, and we need to explicitly
   2076  1017       bmc 	 * pad these strings out with zeroes.
   2077  1017       bmc 	 */
   2078  1017       bmc 	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
   2079  1017       bmc 		int nul;
   2080  1017       bmc 
   2081  1017       bmc 		if (!DTRACEACT_ISSTRING(act))
   2082  1017       bmc 			continue;
   2083  1017       bmc 
   2084  1017       bmc 		i = act->dta_rec.dtrd_offset - agg->dtag_base;
   2085  1017       bmc 		limit = i + act->dta_rec.dtrd_size;
   2086  1017       bmc 		ASSERT(limit <= size);
   2087  1017       bmc 
   2088  1017       bmc 		for (nul = 0; i < limit; i++) {
   2089  1017       bmc 			if (nul) {
   2090  1017       bmc 				kdata[i] = '\0';
   2091  1017       bmc 				continue;
   2092  1017       bmc 			}
   2093  1017       bmc 
   2094  1017       bmc 			if (data[i] != '\0')
   2095  1017       bmc 				continue;
   2096  1017       bmc 
   2097  1017       bmc 			nul = 1;
   2098  1017       bmc 		}
   2099  1017       bmc 	}
   2100     0    stevel 
   2101     0    stevel 	for (i = size; i < fsize; i++)
   2102     0    stevel 		kdata[i] = 0;
   2103     0    stevel 
   2104     0    stevel 	key->dtak_hashval = hashval;
   2105     0    stevel 	key->dtak_size = size;
   2106     0    stevel 	key->dtak_action = action;
   2107     0    stevel 	key->dtak_next = agb->dtagb_hash[ndx];
   2108     0    stevel 	agb->dtagb_hash[ndx] = key;
   2109     0    stevel 
   2110     0    stevel 	/*
   2111     0    stevel 	 * Finally, apply the aggregator.
   2112     0    stevel 	 */
   2113     0    stevel 	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
   2114   457       bmc 	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
   2115     0    stevel }
   2116     0    stevel 
   2117     0    stevel /*
   2118     0    stevel  * Given consumer state, this routine finds a speculation in the INACTIVE
   2119     0    stevel  * state and transitions it into the ACTIVE state.  If there is no speculation
   2120     0    stevel  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
   2121     0    stevel  * incremented -- it is up to the caller to take appropriate action.
   2122     0    stevel  */
   2123     0    stevel static int
   2124     0    stevel dtrace_speculation(dtrace_state_t *state)
   2125     0    stevel {
   2126     0    stevel 	int i = 0;
   2127     0    stevel 	dtrace_speculation_state_t current;
   2128     0    stevel 	uint32_t *stat = &state->dts_speculations_unavail, count;
   2129     0    stevel 
   2130     0    stevel 	while (i < state->dts_nspeculations) {
   2131     0    stevel 		dtrace_speculation_t *spec = &state->dts_speculations[i];
   2132     0    stevel 
   2133     0    stevel 		current = spec->dtsp_state;
   2134     0    stevel 
   2135     0    stevel 		if (current != DTRACESPEC_INACTIVE) {
   2136     0    stevel 			if (current == DTRACESPEC_COMMITTINGMANY ||
   2137     0    stevel 			    current == DTRACESPEC_COMMITTING ||
   2138     0    stevel 			    current == DTRACESPEC_DISCARDING)
   2139     0    stevel 				stat = &state->dts_speculations_busy;
   2140     0    stevel 			i++;
   2141     0    stevel 			continue;
   2142     0    stevel 		}
   2143     0    stevel 
   2144     0    stevel 		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
   2145     0    stevel 		    current, DTRACESPEC_ACTIVE) == current)
   2146     0    stevel 			return (i + 1);
   2147     0    stevel 	}
   2148     0    stevel 
   2149     0    stevel 	/*
   2150     0    stevel 	 * We couldn't find a speculation.  If we found as much as a single
   2151     0    stevel 	 * busy speculation buffer, we'll attribute this failure as "busy"
   2152     0    stevel 	 * instead of "unavail".
   2153     0    stevel 	 */
   2154     0    stevel 	do {
   2155     0    stevel 		count = *stat;
   2156     0    stevel 	} while (dtrace_cas32(stat, count, count + 1) != count);
   2157     0    stevel 
   2158     0    stevel 	return (0);
   2159     0    stevel }
   2160     0    stevel 
   2161     0    stevel /*
   2162     0    stevel  * This routine commits an active speculation.  If the specified speculation
   2163     0    stevel  * is not in a valid state to perform a commit(), this routine will silently do
   2164     0    stevel  * nothing.  The state of the specified speculation is transitioned according
   2165     0    stevel  * to the state transition diagram outlined in <sys/dtrace_impl.h>
   2166     0    stevel  */
   2167     0    stevel static void
   2168     0    stevel dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
   2169     0    stevel     dtrace_specid_t which)
   2170     0    stevel {
   2171     0    stevel 	dtrace_speculation_t *spec;
   2172     0    stevel 	dtrace_buffer_t *src, *dest;
   2173     0    stevel 	uintptr_t daddr, saddr, dlimit;
   2174     0    stevel 	dtrace_speculation_state_t current, new;
   2175     0    stevel 	intptr_t offs;
   2176     0    stevel 
   2177     0    stevel 	if (which == 0)
   2178     0    stevel 		return;
   2179     0    stevel 
   2180     0    stevel 	if (which > state->dts_nspeculations) {
   2181     0    stevel 		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
   2182     0    stevel 		return;
   2183     0    stevel 	}
   2184     0    stevel 
   2185     0    stevel 	spec = &state->dts_speculations[which - 1];
   2186     0    stevel 	src = &spec->dtsp_buffer[cpu];
   2187     0    stevel 	dest = &state->dts_buffer[cpu];
   2188     0    stevel 
   2189     0    stevel 	do {
   2190     0    stevel 		current = spec->dtsp_state;
   2191     0    stevel 
   2192     0    stevel 		if (current == DTRACESPEC_COMMITTINGMANY)
   2193     0    stevel 			break;
   2194     0    stevel 
   2195     0    stevel 		switch (current) {
   2196     0    stevel 		case DTRACESPEC_INACTIVE:
   2197     0    stevel 		case DTRACESPEC_DISCARDING:
   2198     0    stevel 			return;
   2199     0    stevel 
   2200     0    stevel 		case DTRACESPEC_COMMITTING:
   2201     0    stevel 			/*
   2202     0    stevel 			 * This is only possible if we are (a) commit()'ing
   2203     0    stevel 			 * without having done a prior speculate() on this CPU
   2204     0    stevel 			 * and (b) racing with another commit() on a different
   2205     0    stevel 			 * CPU.  There's nothing to do -- we just assert that
   2206     0    stevel 			 * our offset is 0.
   2207     0    stevel 			 */
   2208     0    stevel 			ASSERT(src->dtb_offset == 0);
   2209     0    stevel 			return;
   2210     0    stevel 
   2211     0    stevel 		case DTRACESPEC_ACTIVE:
   2212     0    stevel 			new = DTRACESPEC_COMMITTING;
   2213     0    stevel 			break;
   2214     0    stevel 
   2215     0    stevel 		case DTRACESPEC_ACTIVEONE:
   2216     0    stevel 			/*
   2217     0    stevel 			 * This speculation is active on one CPU.  If our
   2218     0    stevel 			 * buffer offset is non-zero, we know that the one CPU
   2219     0    stevel 			 * must be us.  Otherwise, we are committing on a
   2220     0    stevel 			 * different CPU from the speculate(), and we must
   2221     0    stevel 			 * rely on being asynchronously cleaned.
   2222     0    stevel 			 */
   2223     0    stevel 			if (src->dtb_offset != 0) {
   2224     0    stevel 				new = DTRACESPEC_COMMITTING;
   2225     0    stevel 				break;
   2226     0    stevel 			}
   2227     0    stevel 			/*FALLTHROUGH*/
   2228     0    stevel 
   2229     0    stevel 		case DTRACESPEC_ACTIVEMANY:
   2230     0    stevel 			new = DTRACESPEC_COMMITTINGMANY;
   2231     0    stevel 			break;
   2232     0    stevel 
   2233     0    stevel 		default:
   2234     0    stevel 			ASSERT(0);
   2235     0    stevel 		}
   2236     0    stevel 	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
   2237     0    stevel 	    current, new) != current);
   2238     0    stevel 
   2239     0    stevel 	/*
   2240     0    stevel 	 * We have set the state to indicate that we are committing this
   2241     0    stevel 	 * speculation.  Now reserve the necessary space in the destination
   2242     0    stevel 	 * buffer.
   2243     0    stevel 	 */
   2244     0    stevel 	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
   2245     0    stevel 	    sizeof (uint64_t), state, NULL)) < 0) {
   2246     0    stevel 		dtrace_buffer_drop(dest);
   2247     0    stevel 		goto out;
   2248     0    stevel 	}
   2249     0    stevel 
   2250     0    stevel 	/*
   2251     0    stevel 	 * We have the space; copy the buffer across.  (Note that this is a
   2252     0    stevel 	 * highly subobtimal bcopy(); in the unlikely event that this becomes
   2253     0    stevel 	 * a serious performance issue, a high-performance DTrace-specific
   2254     0    stevel 	 * bcopy() should obviously be invented.)
   2255     0    stevel 	 */
   2256     0    stevel 	daddr = (uintptr_t)dest->dtb_tomax + offs;
   2257     0    stevel 	dlimit = daddr + src->dtb_offset;
   2258     0    stevel 	saddr = (uintptr_t)src->dtb_tomax;
   2259     0    stevel 
   2260     0    stevel 	/*
   2261     0    stevel 	 * First, the aligned portion.
   2262     0    stevel 	 */
   2263     0    stevel 	while (dlimit - daddr >= sizeof (uint64_t)) {
   2264     0    stevel 		*((uint64_t *)daddr) = *((uint64_t *)saddr);
   2265     0    stevel 
   2266     0    stevel 		daddr += sizeof (uint64_t);
   2267     0    stevel 		saddr += sizeof (uint64_t);
   2268     0    stevel 	}
   2269     0    stevel 
   2270     0    stevel 	/*
   2271     0    stevel 	 * Now any left-over bit...
   2272     0    stevel 	 */
   2273     0    stevel 	while (dlimit - daddr)
   2274     0    stevel 		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
   2275     0    stevel 
   2276     0    stevel 	/*
   2277     0    stevel 	 * Finally, commit the reserved space in the destination buffer.
   2278     0    stevel 	 */
   2279     0    stevel 	dest->dtb_offset = offs + src->dtb_offset;
   2280     0    stevel 
   2281     0    stevel out:
   2282     0    stevel 	/*
   2283     0    stevel 	 * If we're lucky enough to be the only active CPU on this speculation
   2284     0    stevel 	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
   2285     0    stevel 	 */
   2286     0    stevel 	if (current == DTRACESPEC_ACTIVE ||
   2287     0    stevel 	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
   2288     0    stevel 		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
   2289     0    stevel 		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
   2290     0    stevel 
   2291     0    stevel 		ASSERT(rval == DTRACESPEC_COMMITTING);
   2292     0    stevel 	}
   2293     0    stevel 
   2294     0    stevel 	src->dtb_offset = 0;
   2295     0    stevel 	src->dtb_xamot_drops += src->dtb_drops;
   2296     0    stevel 	src->dtb_drops = 0;
   2297     0    stevel }
   2298     0    stevel 
   2299     0    stevel /*
   2300     0    stevel  * This routine discards an active speculation.  If the specified speculation
   2301     0    stevel  * is not in a valid state to perform a discard(), this routine will silently
   2302     0    stevel  * do nothing.  The state of the specified speculation is transitioned
   2303     0    stevel  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
   2304     0    stevel  */
   2305     0    stevel static void
   2306     0    stevel dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
   2307     0    stevel     dtrace_specid_t which)
   2308     0    stevel {
   2309     0    stevel 	dtrace_speculation_t *spec;
   2310     0    stevel 	dtrace_speculation_state_t current, new;
   2311     0    stevel 	dtrace_buffer_t *buf;
   2312     0    stevel 
   2313     0    stevel 	if (which == 0)
   2314     0    stevel 		return;
   2315     0    stevel 
   2316     0    stevel 	if (which > state->dts_nspeculations) {
   2317     0    stevel 		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
   2318     0    stevel 		return;
   2319     0    stevel 	}
   2320     0    stevel 
   2321     0    stevel 	spec = &state->dts_speculations[which - 1];
   2322     0    stevel 	buf = &spec->dtsp_buffer[cpu];
   2323     0    stevel 
   2324     0    stevel 	do {
   2325     0    stevel 		current = spec->dtsp_state;
   2326     0    stevel 
   2327     0    stevel 		switch (current) {
   2328     0    stevel 		case DTRACESPEC_INACTIVE:
   2329     0    stevel 		case DTRACESPEC_COMMITTINGMANY:
   2330     0    stevel 		case DTRACESPEC_COMMITTING:
   2331     0    stevel 		case DTRACESPEC_DISCARDING:
   2332     0    stevel 			return;
   2333     0    stevel 
   2334     0    stevel 		case DTRACESPEC_ACTIVE:
   2335     0    stevel 		case DTRACESPEC_ACTIVEMANY:
   2336     0    stevel 			new = DTRACESPEC_DISCARDING;
   2337     0    stevel 			break;
   2338     0    stevel 
   2339     0    stevel 		case DTRACESPEC_ACTIVEONE:
   2340     0    stevel 			if (buf->dtb_offset != 0) {
   2341     0    stevel 				new = DTRACESPEC_INACTIVE;
   2342     0    stevel 			} else {
   2343     0    stevel 				new = DTRACESPEC_DISCARDING;
   2344     0    stevel 			}
   2345     0    stevel 			break;
   2346     0    stevel 
   2347     0    stevel 		default:
   2348     0    stevel 			ASSERT(0);
   2349     0    stevel 		}
   2350     0    stevel 	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
   2351     0    stevel 	    current, new) != current);
   2352     0    stevel 
   2353     0    stevel 	buf->dtb_offset = 0;
   2354     0    stevel 	buf->dtb_drops = 0;
   2355     0    stevel }
   2356     0    stevel 
   2357     0    stevel /*
   2358     0    stevel  * Note:  not called from probe context.  This function is called
   2359     0    stevel  * asynchronously from cross call context to clean any speculations that are
   2360     0    stevel  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
   2361     0    stevel  * transitioned back to the INACTIVE state until all CPUs have cleaned the
   2362     0    stevel  * speculation.
   2363     0    stevel  */
   2364     0    stevel static void
   2365     0    stevel dtrace_speculation_clean_here(dtrace_state_t *state)
   2366     0    stevel {
   2367     0    stevel 	dtrace_icookie_t cookie;
   2368     0    stevel 	processorid_t cpu = CPU->cpu_id;
   2369     0    stevel 	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
   2370     0    stevel 	dtrace_specid_t i;
   2371     0    stevel 
   2372     0    stevel 	cookie = dtrace_interrupt_disable();
   2373     0    stevel 
   2374     0    stevel 	if (dest->dtb_tomax == NULL) {
   2375     0    stevel 		dtrace_interrupt_enable(cookie);
   2376     0    stevel 		return;
   2377     0    stevel 	}
   2378     0    stevel 
   2379     0    stevel 	for (i = 0; i < state->dts_nspeculations; i++) {
   2380     0    stevel 		dtrace_speculation_t *spec = &state->dts_speculations[i];
   2381     0    stevel 		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
   2382     0    stevel 
   2383     0    stevel 		if (src->dtb_tomax == NULL)
   2384     0    stevel 			continue;
   2385     0    stevel 
   2386     0    stevel 		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
   2387     0    stevel 			src->dtb_offset = 0;
   2388     0    stevel 			continue;
   2389     0    stevel 		}
   2390     0    stevel 
   2391     0    stevel 		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
   2392     0    stevel 			continue;
   2393     0    stevel 
   2394     0    stevel 		if (src->dtb_offset == 0)
   2395     0    stevel 			continue;
   2396     0    stevel 
   2397     0    stevel 		dtrace_speculation_commit(state, cpu, i + 1);
   2398     0    stevel 	}
   2399     0    stevel 
   2400     0    stevel 	dtrace_interrupt_enable(cookie);
   2401     0    stevel }
   2402     0    stevel 
   2403     0    stevel /*
   2404     0    stevel  * Note:  not called from probe context.  This function is called
   2405     0    stevel  * asynchronously (and at a regular interval) to clean any speculations that
   2406     0    stevel  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
   2407     0    stevel  * is work to be done, it cross calls all CPUs to perform that work;
   2408     0    stevel  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
   2409     0    stevel  * INACTIVE state until they have been cleaned by all CPUs.
   2410     0    stevel  */
   2411     0    stevel static void
   2412     0    stevel dtrace_speculation_clean(dtrace_state_t *state)
   2413     0    stevel {
   2414     0    stevel 	int work = 0, rv;
   2415     0    stevel 	dtrace_specid_t i;
   2416     0    stevel 
   2417     0    stevel 	for (i = 0; i < state->dts_nspeculations; i++) {
   2418     0    stevel 		dtrace_speculation_t *spec = &state->dts_speculations[i];
   2419     0    stevel 
   2420     0    stevel 		ASSERT(!spec->dtsp_cleaning);
   2421     0    stevel 
   2422     0    stevel 		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
   2423     0    stevel 		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
   2424     0    stevel 			continue;
   2425     0    stevel 
   2426     0    stevel 		work++;
   2427     0    stevel 		spec->dtsp_cleaning = 1;
   2428     0    stevel 	}
   2429     0    stevel 
   2430     0    stevel 	if (!work)
   2431     0    stevel 		return;
   2432     0    stevel 
   2433     0    stevel 	dtrace_xcall(DTRACE_CPUALL,
   2434     0    stevel 	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
   2435     0    stevel 
   2436     0    stevel 	/*
   2437     0    stevel 	 * We now know that all CPUs have committed or discarded their
   2438     0    stevel 	 * speculation buffers, as appropriate.  We can now set the state
   2439     0    stevel 	 * to inactive.
   2440     0    stevel 	 */
   2441     0    stevel 	for (i = 0; i < state->dts_nspeculations; i++) {
   2442     0    stevel 		dtrace_speculation_t *spec = &state->dts_speculations[i];
   2443     0    stevel 		dtrace_speculation_state_t current, new;
   2444     0    stevel 
   2445     0    stevel 		if (!spec->dtsp_cleaning)
   2446     0    stevel 			continue;
   2447     0    stevel 
   2448     0    stevel 		current = spec->dtsp_state;
   2449     0    stevel 		ASSERT(current == DTRACESPEC_DISCARDING ||
   2450     0    stevel 		    current == DTRACESPEC_COMMITTINGMANY);
   2451     0    stevel 
   2452     0    stevel 		new = DTRACESPEC_INACTIVE;
   2453     0    stevel 
   2454     0    stevel 		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
   2455     0    stevel 		ASSERT(rv == current);
   2456     0    stevel 		spec->dtsp_cleaning = 0;
   2457     0    stevel 	}
   2458     0    stevel }
   2459     0    stevel 
   2460     0    stevel /*
   2461     0    stevel  * Called as part of a speculate() to get the speculative buffer associated
   2462     0    stevel  * with a given speculation.  Returns NULL if the specified speculation is not
   2463     0    stevel  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
   2464     0    stevel  * the active CPU is not the specified CPU -- the speculation will be
   2465     0    stevel  * atomically transitioned into the ACTIVEMANY state.
   2466     0    stevel  */
   2467     0    stevel static dtrace_buffer_t *
   2468     0    stevel dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
   2469     0    stevel     dtrace_specid_t which)
   2470     0    stevel {
   2471     0    stevel 	dtrace_speculation_t *spec;
   2472     0    stevel 	dtrace_speculation_state_t current, new;
   2473     0    stevel 	dtrace_buffer_t *buf;
   2474     0    stevel 
   2475     0    stevel 	if (which == 0)
   2476     0    stevel 		return (NULL);
   2477     0    stevel 
   2478     0    stevel 	if (which > state->dts_nspeculations) {
   2479     0    stevel 		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
   2480     0    stevel 		return (NULL);
   2481     0    stevel 	}
   2482     0    stevel 
   2483     0    stevel 	spec = &state->dts_speculations[which - 1];
   2484     0    stevel 	buf = &spec->dtsp_buffer[cpuid];
   2485     0    stevel 
   2486     0    stevel 	do {
   2487     0    stevel 		current = spec->dtsp_state;
   2488     0    stevel 
   2489     0    stevel 		switch (current) {
   2490     0    stevel 		case DTRACESPEC_INACTIVE:
   2491     0    stevel 		case DTRACESPEC_COMMITTINGMANY:
   2492     0    stevel 		case DTRACESPEC_DISCARDING:
   2493     0    stevel 			return (NULL);
   2494     0    stevel 
   2495     0    stevel 		case DTRACESPEC_COMMITTING:
   2496     0    stevel 			ASSERT(buf->dtb_offset == 0);
   2497     0    stevel 			return (NULL);
   2498     0    stevel 
   2499     0    stevel 		case DTRACESPEC_ACTIVEONE:
   2500     0    stevel 			/*
   2501     0    stevel 			 * This speculation is currently active on one CPU.
   2502     0    stevel 			 * Check the offset in the buffer; if it's non-zero,
   2503     0    stevel 			 * that CPU must be us (and we leave the state alone).
   2504     0    stevel 			 * If it's zero, assume that we're starting on a new
   2505     0    stevel 			 * CPU -- and change the state to indicate that the
   2506     0    stevel 			 * speculation is active on more than one CPU.
   2507     0    stevel 			 */
   2508     0    stevel 			if (buf->dtb_offset != 0)
   2509     0    stevel 				return (buf);
   2510     0    stevel 
   2511     0    stevel 			new = DTRACESPEC_ACTIVEMANY;
   2512     0    stevel 			break;
   2513     0    stevel 
   2514     0    stevel 		case DTRACESPEC_ACTIVEMANY:
   2515     0    stevel 			return (buf);
   2516     0    stevel 
   2517     0    stevel 		case DTRACESPEC_ACTIVE:
   2518     0    stevel 			new = DTRACESPEC_ACTIVEONE;
   2519     0    stevel 			break;
   2520     0    stevel 
   2521     0    stevel 		default:
   2522     0    stevel 			ASSERT(0);
   2523     0    stevel 		}
   2524     0    stevel 	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
   2525     0    stevel 	    current, new) != current);
   2526     0    stevel 
   2527     0    stevel 	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
   2528     0    stevel 	return (buf);
   2529     0    stevel }
   2530     0    stevel 
   2531     0    stevel /*
   2532  2870        dp  * Return a string.  In the event that the user lacks the privilege to access
   2533  2870        dp  * arbitrary kernel memory, we copy the string out to scratch memory so that we
   2534  2870        dp  * don't fail access checking.
   2535  2870        dp  *
   2536  2870        dp  * dtrace_dif_variable() uses this routine as a helper for various
   2537  2870        dp  * builtin values such as 'execname' and 'probefunc.'
   2538  2870        dp  */
   2539  2870        dp uintptr_t
   2540  2870        dp dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
   2541  2870        dp     dtrace_mstate_t *mstate)
   2542  2870        dp {
   2543  2870        dp 	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
   2544  2870        dp 	uintptr_t ret;
   2545  2870        dp 	size_t strsz;
   2546  2870        dp 
   2547  2870        dp 	/*
   2548  2870        dp 	 * The easy case: this probe is allowed to read all of memory, so
   2549  2870        dp 	 * we can just return this as a vanilla pointer.
   2550  2870        dp 	 */
   2551  2870        dp 	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
   2552  2870        dp 		return (addr);
   2553  2870        dp 
   2554  2870        dp 	/*
   2555  2870        dp 	 * This is the tougher case: we copy the string in question from
   2556  2870        dp 	 * kernel memory into scratch memory and return it that way: this
   2557  2870        dp 	 * ensures that we won't trip up when access checking tests the
   2558  2870        dp 	 * BYREF return value.
   2559  2870        dp 	 */
   2560  2870        dp 	strsz = dtrace_strlen((char *)addr, size) + 1;
   2561  2870        dp 
   2562  2870        dp 	if (mstate->dtms_scratch_ptr + strsz >
   2563  2870        dp 	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
   2564  2870        dp 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
   2565  2870        dp 		return (NULL);
   2566  2870        dp 	}
   2567  2870        dp 
   2568  2870        dp 	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
   2569  2870        dp 	    strsz);
   2570  2870        dp 	ret = mstate->dtms_scratch_ptr;
   2571  2870        dp 	mstate->dtms_scratch_ptr += strsz;
   2572  2870        dp 	return (ret);
   2573  2870        dp }
   2574  2870        dp 
   2575  2870        dp /*
   2576     0    stevel  * This function implements the DIF emulator's variable lookups.  The emulator
   2577     0    stevel  * passes a reserved variable identifier and optional built-in array index.
   2578     0    stevel  */
   2579     0    stevel static uint64_t
   2580     0    stevel dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
   2581   457       bmc     uint64_t ndx)
   2582     0    stevel {
   2583     0    stevel 	/*
   2584     0    stevel 	 * If we're accessing one of the uncached arguments, we'll turn this
   2585     0    stevel 	 * into a reference in the args array.
   2586     0    stevel 	 */
   2587     0    stevel 	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
   2588   457       bmc 		ndx = v - DIF_VAR_ARG0;
   2589     0    stevel 		v = DIF_VAR_ARGS;
   2590     0    stevel 	}
   2591     0    stevel 
   2592     0    stevel 	switch (v) {
   2593     0    stevel 	case DIF_VAR_ARGS:
   2594     0    stevel 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
   2595   457       bmc 		if (ndx >= sizeof (mstate->dtms_arg) /
   2596     0    stevel 		    sizeof (mstate->dtms_arg[0])) {
   2597     0    stevel 			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
   2598     0    stevel 			dtrace_provider_t *pv;
   2599     0    stevel 			uint64_t val;
   2600     0    stevel 
   2601     0    stevel 			pv = mstate->dtms_probe->dtpr_provider;
   2602     0    stevel 			if (pv->dtpv_pops.dtps_getargval != NULL)
   2603     0    stevel 				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
   2604     0    stevel 				    mstate->dtms_probe->dtpr_id,
   2605   457       bmc 				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
   2606     0    stevel 			else
   2607   457       bmc 				val = dtrace_getarg(ndx, aframes);
   2608     0    stevel 
   2609     0    stevel 			/*
   2610     0    stevel 			 * This is regrettably required to keep the compiler
   2611     0    stevel 			 * from tail-optimizing the call to dtrace_getarg().
   2612     0    stevel 			 * The condition always evaluates to true, but the
   2613     0    stevel 			 * compiler has no way of figuring that out a priori.
   2614     0    stevel 			 * (None of this would be necessary if the compiler
   2615     0    stevel 			 * could be relied upon to _always_ tail-optimize
   2616     0    stevel 			 * the call to dtrace_getarg() -- but it can't.)
   2617     0    stevel 			 */
   2618     0    stevel 			if (mstate->dtms_probe != NULL)
   2619     0    stevel 				return (val);
   2620     0    stevel 
   2621     0    stevel 			ASSERT(0);
   2622     0    stevel 		}
   2623     0    stevel 
   2624   457       bmc 		return (mstate->dtms_arg[ndx]);
   2625     0    stevel 
   2626     0    stevel 	case DIF_VAR_UREGS: {
   2627     0    stevel 		klwp_t *lwp;
   2628     0    stevel 
   2629     0    stevel 		if (!dtrace_priv_proc(state))
   2630     0    stevel 			return (0);
   2631     0    stevel 
   2632     0    stevel 		if ((lwp = curthread->t_lwp) == NULL) {
   2633     0    stevel 			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
   2634     0    stevel 			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
   2635     0    stevel 			return (0);
   2636     0    stevel 		}
   2637     0    stevel 
   2638   457       bmc 		return (dtrace_getreg(lwp->lwp_regs, ndx));
   2639     0    stevel 	}
   2640     0    stevel 
   2641     0    stevel 	case DIF_VAR_CURTHREAD:
   2642     0    stevel 		if (!dtrace_priv_kernel(state))
   2643     0    stevel 			return (0);
   2644     0    stevel 		return ((uint64_t)(uintptr_t)curthread);
   2645     0    stevel 
   2646     0    stevel 	case DIF_VAR_TIMESTAMP:
   2647     0    stevel 		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
   2648     0    stevel 			mstate->dtms_timestamp = dtrace_gethrtime();
   2649     0    stevel 			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
   2650     0    stevel 		}
   2651     0    stevel 		return (mstate->dtms_timestamp);
   2652     0    stevel 
   2653     0    stevel 	case DIF_VAR_VTIMESTAMP:
   2654     0    stevel 		ASSERT(dtrace_vtime_references != 0);
   2655     0    stevel 		return (curthread->t_dtrace_vtime);
   2656     0    stevel 
   2657     0    stevel 	case DIF_VAR_WALLTIMESTAMP:
   2658     0    stevel 		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
   2659     0    stevel 			mstate->dtms_walltimestamp = dtrace_gethrestime();
   2660     0    stevel 			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
   2661     0    stevel 		}
   2662     0    stevel 		return (mstate->dtms_walltimestamp);
   2663     0    stevel 
   2664     0    stevel 	case DIF_VAR_IPL:
   2665     0    stevel 		if (!dtrace_priv_kernel(state))
   2666     0    stevel 			return (0);
   2667     0    stevel 		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
   2668     0    stevel 			mstate->dtms_ipl = dtrace_getipl();
   2669     0    stevel 			mstate->dtms_present |= DTRACE_MSTATE_IPL;
   2670     0    stevel 		}
   2671     0    stevel 		return (mstate->dtms_ipl);
   2672     0    stevel 
   2673     0    stevel 	case DIF_VAR_EPID:
   2674     0    stevel 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
   2675     0    stevel 		return (mstate->dtms_epid);
   2676     0    stevel 
   2677     0    stevel 	case DIF_VAR_ID:
   2678     0    stevel 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
   2679     0    stevel 		return (mstate->dtms_probe->dtpr_id);
   2680     0    stevel 
   2681     0    stevel 	case DIF_VAR_STACKDEPTH:
   2682     0    stevel 		if (!dtrace_priv_kernel(state))
   2683     0    stevel 			return (0);
   2684     0    stevel 		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
   2685     0    stevel 			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
   2686     0    stevel 
   2687     0    stevel 			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
   2688     0    stevel 			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
   2689     0    stevel 		}
   2690     0    stevel 		return (mstate->dtms_stackdepth);
   2691     0    stevel 
   2692   191       ahl 	case DIF_VAR_USTACKDEPTH:
   2693   191       ahl 		if (!dtrace_priv_proc(state))
   2694   191       ahl 			return (0);
   2695   191       ahl 		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
   2696   630       ahl 			/*
   2697   630       ahl 			 * See comment in DIF_VAR_PID.
   2698   630       ahl 			 */
   2699   630       ahl 			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
   2700   630       ahl 			    CPU_ON_INTR(CPU)) {
   2701   630       ahl 				mstate->dtms_ustackdepth = 0;
   2702   630       ahl 			} else {
   2703   630       ahl 				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
   2704   630       ahl 				mstate->dtms_ustackdepth =
   2705   630       ahl 				    dtrace_getustackdepth();
   2706   630       ahl 				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
   2707   630       ahl 			}
   2708   191       ahl 			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
   2709   191       ahl 		}
   2710   191       ahl 		return (mstate->dtms_ustackdepth);
   2711   191       ahl 
   2712     0    stevel 	case DIF_VAR_CALLER:
   2713     0    stevel 		if (!dtrace_priv_kernel(state))
   2714     0    stevel 			return (0);
   2715     0    stevel 		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
   2716     0    stevel 			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
   2717     0    stevel 
   2718     0    stevel 			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
   2719     0    stevel 				/*
   2720     0    stevel 				 * If this is an unanchored probe, we are
   2721     0    stevel 				 * required to go through the slow path:
   2722     0    stevel 				 * dtrace_caller() only guarantees correct
   2723     0    stevel 				 * results for anchored probes.
   2724     0    stevel 				 */
   2725     0    stevel 				pc_t caller[2];
   2726     0    stevel 
   2727     0    stevel 				dtrace_getpcstack(caller, 2, aframes,
   2728   191       ahl 				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
   2729     0    stevel 				mstate->dtms_caller = caller[1];
   2730     0    stevel 			} else if ((mstate->dtms_caller =
   2731     0    stevel 			    dtrace_caller(aframes)) == -1) {
   2732     0    stevel 				/*
   2733     0    stevel 				 * We have failed to do this the quick way;
   2734     0    stevel 				 * we must resort to the slower approach of
   2735     0    stevel 				 * calling dtrace_getpcstack().
   2736     0    stevel 				 */
   2737     0    stevel 				pc_t caller;
   2738     0    stevel 
   2739     0    stevel 				dtrace_getpcstack(&caller, 1, aframes, NULL);
   2740     0    stevel 				mstate->dtms_caller = caller;
   2741     0    stevel 			}
   2742     0    stevel 
   2743     0    stevel 			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
   2744     0    stevel 		}
   2745     0    stevel 		return (mstate->dtms_caller);
   2746   457       bmc 
   2747   457       bmc 	case DIF_VAR_UCALLER:
   2748   457       bmc 		if (!dtrace_priv_proc(state))
   2749   457       bmc 			return (0);
   2750   457       bmc 
   2751   457       bmc 		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
   2752   457       bmc 			uint64_t ustack[3];
   2753   457       bmc 
   2754   457       bmc 			/*
   2755   457       bmc 			 * dtrace_getupcstack() fills in the first uint64_t
   2756   457       bmc 			 * with the current PID.  The second uint64_t will
   2757   457       bmc 			 * be the program counter at user-level.  The third
   2758   457       bmc 			 * uint64_t will contain the caller, which is what
   2759   457       bmc 			 * we're after.
   2760   457       bmc 			 */
   2761   457       bmc 			ustack[2] = NULL;
   2762  5114       ahl 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
   2763   457       bmc 			dtrace_getupcstack(ustack, 3);
   2764  5114       ahl 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
   2765   457       bmc 			mstate->dtms_ucaller = ustack[2];
   2766   457       bmc 			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
   2767   457       bmc 		}
   2768   457       bmc 
   2769   457       bmc 		return (mstate->dtms_ucaller);
   2770     0    stevel 
   2771     0    stevel 	case DIF_VAR_PROBEPROV:
   2772     0    stevel 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
   2773  2870        dp 		return (dtrace_dif_varstr(
   2774  2870        dp 		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
   2775  2870        dp 		    state, mstate));
   2776     0    stevel 
   2777     0    stevel 	case DIF_VAR_PROBEMOD:
   2778     0    stevel 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
   2779  2870        dp 		return (dtrace_dif_varstr(
   2780  2870        dp 		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
   2781  2870        dp 		    state, mstate));
   2782     0    stevel 
   2783     0    stevel 	case DIF_VAR_PROBEFUNC:
   2784     0    stevel 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
   2785  2870        dp 		return (dtrace_dif_varstr(
   2786  2870        dp 		    (uintptr_t)mstate->dtms_probe->dtpr_func,
   2787  2870        dp 		    state, mstate));
   2788     0    stevel 
   2789     0    stevel 	case DIF_VAR_PROBENAME:
   2790     0    stevel 		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
   2791  2870        dp 		return (dtrace_dif_varstr(
   2792  2870        dp 		    (uintptr_t)mstate->dtms_probe->dtpr_name,
   2793  2870        dp 		    state, mstate));
   2794     0    stevel 
   2795     0    stevel 	case DIF_VAR_PID:
   2796     0    stevel 		if (!dtrace_priv_proc(state))
   2797     0    stevel 			return (0);
   2798     0    stevel 
   2799     0    stevel 		/*
   2800     0    stevel 		 * Note that we are assuming that an unanchored probe is
   2801     0    stevel 		 * always due to a high-level interrupt.  (And we're assuming
   2802     0    stevel 		 * that there is only a single high level interrupt.)
   2803     0    stevel 		 */
   2804     0    stevel 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2805     0    stevel 			return (pid0.pid_id);
   2806     0    stevel 
   2807     0    stevel 		/*
   2808     0    stevel 		 * It is always safe to dereference one's own t_procp pointer:
   2809     0    stevel 		 * it always points to a valid, allocated proc structure.
   2810     0    stevel 		 * Further, it is always safe to dereference the p_pidp member
   2811     0    stevel 		 * of one's own proc structure.  (These are truisms becuase
   2812     0    stevel 		 * threads and processes don't clean up their own state --
   2813     0    stevel 		 * they leave that task to whomever reaps them.)
   2814     0    stevel 		 */
   2815     0    stevel 		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
   2816     0    stevel 
   2817  2525        dp 	case DIF_VAR_PPID:
   2818  2525        dp 		if (!dtrace_priv_proc(state))
   2819  2525        dp 			return (0);
   2820  2525        dp 
   2821  2525        dp 		/*
   2822  2525        dp 		 * See comment in DIF_VAR_PID.
   2823  2525        dp 		 */
   2824  2525        dp 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2825  2525        dp 			return (pid0.pid_id);
   2826  2525        dp 
   2827     0    stevel 		/*
   2828     0    stevel 		 * It is always safe to dereference one's own t_procp pointer:
   2829     0    stevel 		 * it always points to a valid, allocated proc structure.
   2830     0    stevel 		 * (This is true because threads don't clean up their own
   2831     0    stevel 		 * state -- they leave that task to whomever reaps them.)
   2832     0    stevel 		 */
   2833  2756        dp 		return ((uint64_t)curthread->t_procp->p_ppid);
   2834  2756        dp 
   2835  2756        dp 	case DIF_VAR_TID:
   2836  2756        dp 		/*
   2837  2756        dp 		 * See comment in DIF_VAR_PID.
   2838  2756        dp 		 */
   2839  2756        dp 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2840  2756        dp 			return (0);
   2841  2756        dp 
   2842  2756        dp 		return ((uint64_t)curthread->t_tid);
   2843  2756        dp 
   2844  2756        dp 	case DIF_VAR_EXECNAME:
   2845     0    stevel 		if (!dtrace_priv_proc(state))
   2846     0    stevel 			return (0);
   2847     0    stevel 
   2848     0    stevel 		/*
   2849     0    stevel 		 * See comment in DIF_VAR_PID.
   2850     0    stevel 		 */
   2851     0    stevel 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2852  2756        dp 			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
   2853     0    stevel 
   2854     0    stevel 		/*
   2855     0    stevel 		 * It is always safe to dereference one's own t_procp pointer:
   2856     0    stevel 		 * it always points to a valid, allocated proc structure.
   2857     0    stevel 		 * (This is true because threads don't clean up their own
   2858     0    stevel 		 * state -- they leave that task to whomever reaps them.)
   2859     0    stevel 		 */
   2860  2870        dp 		return (dtrace_dif_varstr(
   2861  2870        dp 		    (uintptr_t)curthread->t_procp->p_user.u_comm,
   2862  2870        dp 		    state, mstate));
   2863  2756        dp 
   2864  2756        dp 	case DIF_VAR_ZONENAME:
   2865  2756        dp 		if (!dtrace_priv_proc(state))
   2866  2756        dp 			return (0);
   2867  2756        dp 
   2868  2756        dp 		/*
   2869  2756        dp 		 * See comment in DIF_VAR_PID.
   2870  2756        dp 		 */
   2871  2756        dp 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2872  2756        dp 			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
   2873  2756        dp 
   2874  2756        dp 		/*
   2875  2756        dp 		 * It is always safe to dereference one's own t_procp pointer:
   2876  2756        dp 		 * it always points to a valid, allocated proc structure.
   2877  2756        dp 		 * (This is true because threads don't clean up their own
   2878  2756        dp 		 * state -- they leave that task to whomever reaps them.)
   2879  2756        dp 		 */
   2880  2870        dp 		return (dtrace_dif_varstr(
   2881  2870        dp 		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
   2882  2870        dp 		    state, mstate));
   2883     0    stevel 
   2884  2525        dp 	case DIF_VAR_UID:
   2885  2525        dp 		if (!dtrace_priv_proc(state))
   2886  2525        dp 			return (0);
   2887  2525        dp 
   2888  2525        dp 		/*
   2889  2525        dp 		 * See comment in DIF_VAR_PID.
   2890  2525        dp 		 */
   2891  2525        dp 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2892  2525        dp 			return ((uint64_t)p0.p_cred->cr_uid);
   2893  2525        dp 
   2894  2756        dp 		/*
   2895  2756        dp 		 * It is always safe to dereference one's own t_procp pointer:
   2896  2756        dp 		 * it always points to a valid, allocated proc structure.
   2897  2756        dp 		 * (This is true because threads don't clean up their own
   2898  2756        dp 		 * state -- they leave that task to whomever reaps them.)
   2899  2756        dp 		 *
   2900  2756        dp 		 * Additionally, it is safe to dereference one's own process
   2901  2756        dp 		 * credential, since this is never NULL after process birth.
   2902  2756        dp 		 */
   2903  2756        dp 		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
   2904  2525        dp 
   2905  2525        dp 	case DIF_VAR_GID:
   2906  2525        dp 		if (!dtrace_priv_proc(state))
   2907  2525        dp 			return (0);
   2908  2525        dp 
   2909  2525        dp 		/*
   2910  2525        dp 		 * See comment in DIF_VAR_PID.
   2911  2525        dp 		 */
   2912  2525        dp 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2913  2525        dp 			return ((uint64_t)p0.p_cred->cr_gid);
   2914  2525        dp 
   2915  2756        dp 		/*
   2916  2756        dp 		 * It is always safe to dereference one's own t_procp pointer:
   2917  2756        dp 		 * it always points to a valid, allocated proc structure.
   2918  2756        dp 		 * (This is true because threads don't clean up their own
   2919  2756        dp 		 * state -- they leave that task to whomever reaps them.)
   2920  2756        dp 		 *
   2921  2756        dp 		 * Additionally, it is safe to dereference one's own process
   2922  2756        dp 		 * credential, since this is never NULL after process birth.
   2923  2756        dp 		 */
   2924  2756        dp 		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
   2925  2525        dp 
   2926  2525        dp 	case DIF_VAR_ERRNO: {
   2927  2525        dp 		klwp_t *lwp;
   2928  2525        dp 		if (!dtrace_priv_proc(state))
   2929  2525        dp 			return (0);
   2930  2525        dp 
   2931  2525        dp 		/*
   2932  2525        dp 		 * See comment in DIF_VAR_PID.
   2933  2525        dp 		 */
   2934  2525        dp 		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
   2935  2525        dp 			return (0);
   2936  2525        dp 
   2937  2756        dp 		/*
   2938  2756        dp 		 * It is always safe to dereference one's own t_lwp pointer in
   2939  2756        dp 		 * the event that this pointer is non-NULL.  (This is true
   2940  2756        dp 		 * because threads and lwps don't clean up their own state --
   2941  2756        dp 		 * they leave that task to whomever reaps them.)
   2942  2756        dp 		 */
   2943  2525        dp 		if ((lwp = curthread->t_lwp) == NULL)
   2944  2525        dp 			return (0);
   2945  2525        dp 
   2946  2525        dp 		return ((uint64_t)lwp->lwp_errno);
   2947  2525        dp 	}
   2948     0    stevel 	default:
   2949     0    stevel 		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
   2950     0    stevel 		return (0);
   2951     0    stevel 	}
   2952     0    stevel }
   2953     0    stevel 
   2954     0    stevel /*
   2955     0    stevel  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
   2956     0    stevel  * Notice that we don't bother validating the proper number of arguments or
   2957     0    stevel  * their types in the tuple stack.  This isn't needed because all argument
   2958     0    stevel  * interpretation is safe because of our load safety -- the worst that can
   2959     0    stevel  * happen is that a bogus program can obtain bogus results.
   2960     0    stevel  */
   2961     0    stevel static void
   2962     0    stevel dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
   2963     0    stevel     dtrace_key_t *tupregs, int nargs,
   2964     0    stevel     dtrace_mstate_t *mstate, dtrace_state_t *state)
   2965     0    stevel {
   2966     0    stevel 	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
   2967     0    stevel 	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
   2968  2870        dp 	dtrace_vstate_t *vstate = &state->dts_vstate;
   2969     0    stevel 
   2970     0    stevel 	union {
   2971     0    stevel 		mutex_impl_t mi;
   2972     0    stevel 		uint64_t mx;
   2973     0    stevel 	} m;
   2974     0    stevel 
   2975     0    stevel 	union {
   2976     0    stevel 		krwlock_t ri;
   2977     0    stevel 		uintptr_t rw;
   2978     0    stevel 	} r;
   2979     0    stevel 
   2980     0    stevel 	switch (subr) {
   2981     0    stevel 	case DIF_SUBR_RAND:
   2982     0    stevel 		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
   2983     0    stevel 		break;
   2984     0    stevel 
   2985     0    stevel 	case DIF_SUBR_MUTEX_OWNED:
   2986  2870        dp 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
   2987  2870        dp 		    mstate, vstate)) {
   2988  2870        dp 			regs[rd] = NULL;
   2989  2870        dp 			break;
   2990  2870        dp 		}
   2991  2870        dp 
   2992     0    stevel 		m.mx = dtrace_load64(tupregs[0].dttk_value);
   2993     0    stevel 		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
   2994     0    stevel 			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
   2995     0    stevel 		else
   2996     0    stevel 			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
   2997     0    stevel 		break;
   2998     0    stevel 
   2999     0    stevel 	case DIF_SUBR_MUTEX_OWNER:
   3000  2870        dp 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
   3001  2870        dp 		    mstate, vstate)) {
   3002  2870        dp 			regs[rd] = NULL;
   3003  2870        dp 			break;
   3004  2870        dp 		}
   3005  2870        dp 
   3006     0    stevel 		m.mx = dtrace_load64(tupregs[0].dttk_value);
   3007     0    stevel 		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
   3008     0    stevel 		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
   3009     0    stevel 			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
   3010     0    stevel 		else
   3011     0    stevel 			regs[rd] = 0;
   3012     0    stevel 		break;
   3013     0    stevel 
   3014     0    stevel 	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
   3015  2870        dp 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
   3016  2870        dp 		    mstate, vstate)) {
   3017  2870        dp 			regs[rd] = NULL;
   3018  2870        dp 			break;
   3019  2870        dp 		}
   3020  2870        dp 
   3021     0    stevel 		m.mx = dtrace_load64(tupregs[0].dttk_value);
   3022     0    stevel 		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
   3023     0    stevel 		break;
   3024     0    stevel 
   3025     0    stevel 	case DIF_SUBR_MUTEX_TYPE_SPIN:
   3026  2870        dp 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
   3027  2870        dp 		    mstate, vstate)) {
   3028  2870        dp 			regs[rd] = NULL;
   3029  2870        dp 			break;
   3030  2870        dp 		}
   3031  2870        dp 
   3032     0    stevel 		m.mx = dtrace_load64(tupregs[0].dttk_value);
   3033     0    stevel 		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
   3034     0    stevel 		break;
   3035     0    stevel 
   3036     0    stevel 	case DIF_SUBR_RW_READ_HELD: {
   3037     0    stevel 		uintptr_t tmp;
   3038     0    stevel 
   3039  2870        dp 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
   3040  2870        dp 		    mstate, vstate)) {
   3041  2870        dp 			regs[rd] = NULL;
   3042  2870        dp 			break;
   3043  2870        dp 		}
   3044  2870        dp 
   3045     0    stevel 		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
   3046     0    stevel 		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
   3047     0    stevel 		break;
   3048     0    stevel 	}
   3049     0    stevel 
   3050     0    stevel 	case DIF_SUBR_RW_WRITE_HELD:
   3051  2870        dp 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
   3052  2870        dp 		    mstate, vstate)) {
   3053  2870        dp 			regs[rd] = NULL;
   3054  2870        dp 			break;
   3055  2870        dp 		}
   3056  2870        dp 
   3057     0    stevel 		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
   3058     0    stevel 		regs[rd] = _RW_WRITE_HELD(&r.ri);
   3059     0    stevel 		break;
   3060     0    stevel 
   3061     0    stevel 	case DIF_SUBR_RW_ISWRITER:
   3062  2870        dp 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
   3063  2870        dp 		    mstate, vstate)) {
   3064  2870        dp 			regs[rd] = NULL;
   3065  2870        dp 			break;
   3066  2870        dp 		}
   3067  2870        dp 
   3068     0    stevel 		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
   3069     0    stevel 		regs[rd] = _RW_ISWRITER(&r.ri);
   3070     0    stevel 		break;
   3071     0    stevel 
   3072     0    stevel 	case DIF_SUBR_BCOPY: {
   3073     0    stevel 		/*
   3074     0    stevel 		 * We need to be sure that the destination is in the scratch
   3075     0    stevel 		 * region -- no other region is allowed.
   3076     0    stevel 		 */
   3077     0    stevel 		uintptr_t src = tupregs[0].dttk_value;
   3078     0    stevel 		uintptr_t dest = tupregs[1].dttk_value;
   3079     0    stevel 		size_t size = tupregs[2].dttk_value;
   3080     0    stevel 
   3081     0    stevel 		if (!dtrace_inscratch(dest, size, mstate)) {
   3082     0    stevel 			*flags |= CPU_DTRACE_BADADDR;
   3083     0    stevel 			*illval = regs[rd];
   3084  2870        dp 			break;
   3085  2870        dp 		}
   3086  2870        dp 
   3087  2870        dp 		if (!dtrace_canload(src, size, mstate, vstate)) {
   3088  2870        dp 			regs[rd] = NULL;
   3089     0    stevel 			break;
   3090     0    stevel 		}
   3091     0    stevel 
   3092     0    stevel 		dtrace_bcopy((void *)src, (void *)dest, size);
   3093     0    stevel 		break;
   3094     0    stevel 	}
   3095     0    stevel 
   3096     0    stevel 	case DIF_SUBR_ALLOCA:
   3097     0    stevel 	case DIF_SUBR_COPYIN: {
   3098     0    stevel 		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
   3099     0    stevel 		uint64_t size =
   3100     0    stevel 		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
   3101     0    stevel 		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
   3102     0    stevel 
   3103     0    stevel 		/*
   3104     0    stevel 		 * This action doesn't require any credential checks since
   3105     0    stevel 		 * probes will not activate in user contexts to which the
   3106     0    stevel 		 * enabling user does not have permissions.
   3107     0    stevel 		 */
   3108  2922        dp 
   3109  2922        dp 		/*
   3110  2922        dp 		 * Rounding up the user allocation size could have overflowed
   3111  2922        dp 		 * a large, bogus allocation (like -1ULL) to 0.
   3112  2922        dp 		 */
   3113  2922        dp 		if (scratch_size < size ||
   3114  2922        dp 		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
   3115     0    stevel 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
   3116     0    stevel 			regs[rd] = NULL;
   3117     0    stevel 			break;
   3118     0    stevel 		}
   3119     0    stevel 
   3120     0    stevel 		if (subr == DIF_SUBR_COPYIN) {
   3121     0    stevel 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
   3122  3677   sudheer 			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
   3123     0    stevel 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
   3124     0    stevel 		}
   3125     0    stevel 
   3126     0    stevel 		mstate->dtms_scratch_ptr += scratch_size;
   3127     0    stevel 		regs[rd] = dest;
   3128     0    stevel 		break;
   3129     0    stevel 	}
   3130     0    stevel 
   3131     0    stevel 	case DIF_SUBR_COPYINTO: {
   3132     0    stevel 		uint64_t size = tupregs[1].dttk_value;
   3133     0    stevel 		uintptr_t dest = tupregs[2].dttk_value;
   3134     0    stevel 
   3135     0    stevel 		/*
   3136     0    stevel 		 * This action doesn't require any credential checks since
   3137     0    stevel 		 * probes will not activate in user contexts to which the
   3138     0    stevel 		 * enabling user does not have permissions.
   3139     0    stevel 		 */
   3140     0    stevel 		if (!dtrace_inscratch(dest, size, mstate)) {
   3141     0    stevel 			*flags |= CPU_DTRACE_BADADDR;
   3142     0    stevel 			*illval = regs[rd];
   3143     0    stevel 			break;
   3144     0    stevel 		}
   3145     0    stevel 
   3146     0    stevel 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
   3147  3677   sudheer 		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
   3148     0    stevel 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
   3149     0    stevel 		break;
   3150     0    stevel 	}
   3151     0    stevel 
   3152     0    stevel 	case DIF_SUBR_COPYINSTR: {
   3153     0    stevel 		uintptr_t dest = mstate->dtms_scratch_ptr;
   3154     0    stevel 		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
   3155     0    stevel 
   3156     0    stevel 		if (nargs > 1 && tupregs[1].dttk_value < size)
   3157     0    stevel 			size = tupregs[1].dttk_value + 1;
   3158     0    stevel 
   3159     0    stevel 		/*
   3160     0    stevel 		 * This action doesn't require any credential checks since
   3161     0    stevel 		 * probes will not activate in user contexts to which the
   3162     0    stevel 		 * enabling user does not have permissions.
   3163     0    stevel 		 */
   3164  2922        dp 		if (!DTRACE_INSCRATCH(mstate, size)) {
   3165     0    stevel 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
   3166     0    stevel 			regs[rd] = NULL;
   3167     0    stevel 			break;
   3168     0    stevel 		}
   3169     0    stevel 
   3170     0    stevel 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
   3171  3677   sudheer 		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
   3172     0    stevel 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
   3173     0    stevel 
   3174     0    stevel 		((char *)dest)[size - 1] = '\0';
   3175     0    stevel 		mstate->dtms_scratch_ptr += size;
   3176     0    stevel 		regs[rd] = dest;
   3177     0    stevel 		break;
   3178     0    stevel 	}
   3179     0    stevel 
   3180     0    stevel 	case DIF_SUBR_MSGSIZE:
   3181     0    stevel 	case DIF_SUBR_MSGDSIZE</