1 0 stevel /* 2 0 stevel * CDDL HEADER START 3 0 stevel * 4 0 stevel * The contents of this file are subject to the terms of the 5 1677 dp * Common Development and Distribution License (the "License"). 6 1677 dp * You may not use this file except in compliance with the License. 7 0 stevel * 8 0 stevel * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 0 stevel * or http://www.opensolaris.org/os/licensing. 10 0 stevel * See the License for the specific language governing permissions 11 0 stevel * and limitations under the License. 12 0 stevel * 13 0 stevel * When distributing Covered Code, include this CDDL HEADER in each 14 0 stevel * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 0 stevel * If applicable, add the following below this CDDL HEADER, with the 16 0 stevel * fields enclosed by brackets "[]" replaced with your own identifying 17 0 stevel * information: Portions Copyright [yyyy] [name of copyright owner] 18 0 stevel * 19 0 stevel * CDDL HEADER END 20 0 stevel */ 21 1710 ahl 22 0 stevel /* 23 8803 Jonathan * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 0 stevel * Use is subject to license terms. 25 0 stevel */ 26 0 stevel 27 0 stevel /* 28 0 stevel * DTrace - Dynamic Tracing for Solaris 29 0 stevel * 30 0 stevel * This is the implementation of the Solaris Dynamic Tracing framework 31 0 stevel * (DTrace). The user-visible interface to DTrace is described at length in 32 0 stevel * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace 33 0 stevel * library, the in-kernel DTrace framework, and the DTrace providers are 34 0 stevel * described in the block comments in the <sys/dtrace.h> header file. The 35 0 stevel * internal architecture of DTrace is described in the block comments in the 36 0 stevel * <sys/dtrace_impl.h> header file. 
The comments contained within the DTrace 37 0 stevel * implementation very much assume mastery of all of these sources; if one has 38 0 stevel * an unanswered question about the implementation, one should consult them 39 0 stevel * first. 40 0 stevel * 41 0 stevel * The functions here are ordered roughly as follows: 42 0 stevel * 43 0 stevel * - Probe context functions 44 0 stevel * - Probe hashing functions 45 0 stevel * - Non-probe context utility functions 46 0 stevel * - Matching functions 47 0 stevel * - Provider-to-Framework API functions 48 0 stevel * - Probe management functions 49 0 stevel * - DIF object functions 50 0 stevel * - Format functions 51 0 stevel * - Predicate functions 52 0 stevel * - ECB functions 53 0 stevel * - Buffer functions 54 0 stevel * - Enabling functions 55 0 stevel * - DOF functions 56 0 stevel * - Anonymous enabling functions 57 0 stevel * - Consumer state functions 58 0 stevel * - Helper functions 59 0 stevel * - Hook functions 60 0 stevel * - Driver cookbook functions 61 0 stevel * 62 0 stevel * Each group of functions begins with a block comment labelled the "DTrace 63 0 stevel * [Group] Functions", allowing one to find each block by searching forward 64 0 stevel * on capital-f functions. 
65 0 stevel */ 66 0 stevel #include <sys/errno.h> 67 0 stevel #include <sys/stat.h> 68 0 stevel #include <sys/modctl.h> 69 0 stevel #include <sys/conf.h> 70 0 stevel #include <sys/systm.h> 71 0 stevel #include <sys/ddi.h> 72 0 stevel #include <sys/sunddi.h> 73 0 stevel #include <sys/cpuvar.h> 74 0 stevel #include <sys/kmem.h> 75 0 stevel #include <sys/strsubr.h> 76 0 stevel #include <sys/sysmacros.h> 77 0 stevel #include <sys/dtrace_impl.h> 78 0 stevel #include <sys/atomic.h> 79 0 stevel #include <sys/cmn_err.h> 80 0 stevel #include <sys/mutex_impl.h> 81 0 stevel #include <sys/rwlock_impl.h> 82 0 stevel #include <sys/ctf_api.h> 83 0 stevel #include <sys/panic.h> 84 0 stevel #include <sys/priv_impl.h> 85 0 stevel #include <sys/policy.h> 86 0 stevel #include <sys/cred_impl.h> 87 0 stevel #include <sys/procfs_isa.h> 88 0 stevel #include <sys/taskq.h> 89 0 stevel #include <sys/mkdev.h> 90 0 stevel #include <sys/kdi.h> 91 0 stevel #include <sys/zone.h> 92 4291 brendan #include <sys/socket.h> 93 4291 brendan #include <netinet/in.h> 94 0 stevel 95 0 stevel /* 96 0 stevel * DTrace Tunable Variables 97 0 stevel * 98 0 stevel * The following variables may be tuned by adding a line to /etc/system that 99 0 stevel * includes both the name of the DTrace module ("dtrace") and the name of the 100 0 stevel * variable. For example: 101 0 stevel * 102 0 stevel * set dtrace:dtrace_destructive_disallow = 1 103 0 stevel * 104 0 stevel * In general, the only variables that one should be tuning this way are those 105 0 stevel * that affect system-wide DTrace behavior, and for which the default behavior 106 0 stevel * is undesirable. Most of these variables are tunable on a per-consumer 107 0 stevel * basis using DTrace options, and need not be tuned on a system-wide basis. 
108 0 stevel * When tuning these variables, avoid pathological values; while some attempt 109 0 stevel * is made to verify the integrity of these variables, they are not considered 110 0 stevel * part of the supported interface to DTrace, and they are therefore not 111 0 stevel * checked comprehensively. Further, these variables should not be tuned 112 0 stevel * dynamically via "mdb -kw" or other means; they should only be tuned via 113 0 stevel * /etc/system. 114 0 stevel */ 115 0 stevel int dtrace_destructive_disallow = 0; 116 0 stevel dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024); 117 0 stevel size_t dtrace_difo_maxsize = (256 * 1024); 118 0 stevel dtrace_optval_t dtrace_dof_maxsize = (256 * 1024); 119 0 stevel size_t dtrace_global_maxsize = (16 * 1024); 120 0 stevel size_t dtrace_actions_max = (16 * 1024); 121 0 stevel size_t dtrace_retain_max = 1024; 122 0 stevel dtrace_optval_t dtrace_helper_actions_max = 32; 123 0 stevel dtrace_optval_t dtrace_helper_providers_max = 32; 124 0 stevel dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); 125 0 stevel size_t dtrace_strsize_default = 256; 126 0 stevel dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */ 127 0 stevel dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */ 128 0 stevel dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */ 129 0 stevel dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */ 130 0 stevel dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */ 131 0 stevel dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */ 132 0 stevel dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */ 133 0 stevel dtrace_optval_t dtrace_nspec_default = 1; 134 0 stevel dtrace_optval_t dtrace_specsize_default = 32 * 1024; 135 0 stevel dtrace_optval_t dtrace_stackframes_default = 20; 136 0 stevel dtrace_optval_t dtrace_ustackframes_default = 20; 137 0 stevel dtrace_optval_t dtrace_jstackframes_default = 
50; 138 0 stevel dtrace_optval_t dtrace_jstackstrsize_default = 512; 139 0 stevel int dtrace_msgdsize_max = 128; 140 0 stevel hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */ 141 0 stevel hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */ 142 0 stevel int dtrace_devdepth_max = 32; 143 0 stevel int dtrace_err_verbose; 144 0 stevel hrtime_t dtrace_deadman_interval = NANOSEC; 145 0 stevel hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; 146 0 stevel hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC; 147 0 stevel 148 0 stevel /* 149 0 stevel * DTrace External Variables 150 0 stevel * 151 0 stevel * As dtrace(7D) is a kernel module, any DTrace variables are obviously 152 0 stevel * available to DTrace consumers via the backtick (`) syntax. One of these, 153 0 stevel * dtrace_zero, is made deliberately so: it is provided as a source of 154 0 stevel * well-known, zero-filled memory. While this variable is not documented, 155 0 stevel * it is used by some translators as an implementation detail. 
156 0 stevel */ 157 0 stevel const char dtrace_zero[256] = { 0 }; /* zero-filled memory */ 158 0 stevel 159 0 stevel /* 160 0 stevel * DTrace Internal Variables 161 0 stevel */ 162 0 stevel static dev_info_t *dtrace_devi; /* device info */ 163 0 stevel static vmem_t *dtrace_arena; /* probe ID arena */ 164 0 stevel static vmem_t *dtrace_minor; /* minor number arena */ 165 0 stevel static taskq_t *dtrace_taskq; /* task queue */ 166 0 stevel static dtrace_probe_t **dtrace_probes; /* array of all probes */ 167 0 stevel static int dtrace_nprobes; /* number of probes */ 168 0 stevel static dtrace_provider_t *dtrace_provider; /* provider list */ 169 0 stevel static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ 170 0 stevel static int dtrace_opens; /* number of opens */ 171 457 bmc static int dtrace_helpers; /* number of helpers */ 172 0 stevel static void *dtrace_softstate; /* softstate pointer */ 173 0 stevel static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ 174 0 stevel static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ 175 0 stevel static dtrace_hash_t *dtrace_byname; /* probes hashed by name */ 176 0 stevel static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */ 177 0 stevel static int dtrace_toxranges; /* number of toxic ranges */ 178 0 stevel static int dtrace_toxranges_max; /* size of toxic range array */ 179 0 stevel static dtrace_anon_t dtrace_anon; /* anonymous enabling */ 180 0 stevel static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */ 181 0 stevel static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */ 182 0 stevel static kthread_t *dtrace_panicked; /* panicking thread */ 183 0 stevel static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */ 184 0 stevel static dtrace_genid_t dtrace_probegen; /* current probe generation */ 185 0 stevel static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */ 186 0 stevel static dtrace_enabling_t 
*dtrace_retained; /* list of retained enablings */ 187 7590 Jonathan static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */ 188 1739 bmc static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */ 189 0 stevel 190 0 stevel /* 191 0 stevel * DTrace Locking 192 0 stevel * DTrace is protected by three (relatively coarse-grained) locks: 193 0 stevel * 194 0 stevel * (1) dtrace_lock is required to manipulate essentially any DTrace state, 195 0 stevel * including enabling state, probes, ECBs, consumer state, helper state, 196 0 stevel * etc. Importantly, dtrace_lock is _not_ required when in probe context; 197 0 stevel * probe context is lock-free -- synchronization is handled via the 198 0 stevel * dtrace_sync() cross call mechanism. 199 0 stevel * 200 0 stevel * (2) dtrace_provider_lock is required when manipulating provider state, or 201 0 stevel * when provider state must be held constant. 202 0 stevel * 203 0 stevel * (3) dtrace_meta_lock is required when manipulating meta provider state, or 204 0 stevel * when meta provider state must be held constant. 205 0 stevel * 206 0 stevel * The lock ordering between these three locks is dtrace_meta_lock before 207 0 stevel * dtrace_provider_lock before dtrace_lock. (In particular, there are 208 0 stevel * several places where dtrace_provider_lock is held by the framework as it 209 0 stevel * calls into the providers -- which then call back into the framework, 210 0 stevel * grabbing dtrace_lock.) 211 0 stevel * 212 457 bmc * There are two other locks in the mix: mod_lock and cpu_lock. With respect 213 457 bmc * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical 214 457 bmc * role as a coarse-grained lock; it is acquired before both of these locks. 215 457 bmc * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must 216 457 bmc * be acquired _between_ dtrace_meta_lock and any other DTrace locks. 
217 457 bmc * mod_lock is similar with respect to dtrace_provider_lock in that it must be 218 457 bmc * acquired _between_ dtrace_provider_lock and dtrace_lock. 219 0 stevel */ 220 0 stevel static kmutex_t dtrace_lock; /* probe state lock */ 221 0 stevel static kmutex_t dtrace_provider_lock; /* provider state lock */ 222 0 stevel static kmutex_t dtrace_meta_lock; /* meta-provider state lock */ 223 0 stevel 224 0 stevel /* 225 0 stevel * DTrace Provider Variables 226 0 stevel * 227 0 stevel * These are the variables relating to DTrace as a provider (that is, the 228 0 stevel * provider of the BEGIN, END, and ERROR probes). 229 0 stevel */ 230 0 stevel static dtrace_pattr_t dtrace_provider_attr = { 231 0 stevel { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, 232 0 stevel { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, 233 0 stevel { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, 234 0 stevel { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, 235 0 stevel { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, 236 0 stevel }; 237 0 stevel 238 0 stevel static void 239 0 stevel dtrace_nullop(void) 240 0 stevel {} 241 0 stevel 242 8803 Jonathan static int 243 8803 Jonathan dtrace_enable_nullop(void) 244 8803 Jonathan { 245 8803 Jonathan return (0); 246 8803 Jonathan } 247 8803 Jonathan 248 0 stevel static dtrace_pops_t dtrace_provider_ops = { 249 0 stevel (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, 250 0 stevel (void (*)(void *, struct modctl *))dtrace_nullop, 251 8803 Jonathan (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, 252 0 stevel (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, 253 0 stevel (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, 254 0 stevel (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, 255 0 stevel NULL, 256 0 stevel NULL, 257 0 stevel NULL, 258 0 stevel (void (*)(void *, 
dtrace_id_t, void *))dtrace_nullop 259 0 stevel }; 260 0 stevel 261 0 stevel static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */ 262 0 stevel static dtrace_id_t dtrace_probeid_end; /* special END probe */ 263 0 stevel dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ 264 0 stevel 265 0 stevel /* 266 0 stevel * DTrace Helper Tracing Variables 267 0 stevel */ 268 0 stevel uint32_t dtrace_helptrace_next = 0; 269 0 stevel uint32_t dtrace_helptrace_nlocals; 270 0 stevel char *dtrace_helptrace_buffer; 271 0 stevel int dtrace_helptrace_bufsize = 512 * 1024; 272 0 stevel 273 0 stevel #ifdef DEBUG 274 0 stevel int dtrace_helptrace_enabled = 1; 275 0 stevel #else 276 0 stevel int dtrace_helptrace_enabled = 0; 277 0 stevel #endif 278 0 stevel 279 0 stevel /* 280 0 stevel * DTrace Error Hashing 281 0 stevel * 282 0 stevel * On DEBUG kernels, DTrace will track the errors that has seen in a hash 283 0 stevel * table. This is very useful for checking coverage of tests that are 284 0 stevel * expected to induce DIF or DOF processing errors, and may be useful for 285 0 stevel * debugging problems in the DIF code generator or in DOF generation . The 286 0 stevel * error hash may be examined with the ::dtrace_errhash MDB dcmd. 287 0 stevel */ 288 0 stevel #ifdef DEBUG 289 0 stevel static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ]; 290 0 stevel static const char *dtrace_errlast; 291 0 stevel static kthread_t *dtrace_errthread; 292 0 stevel static kmutex_t dtrace_errlock; 293 0 stevel #endif 294 0 stevel 295 0 stevel /* 296 0 stevel * DTrace Macros and Constants 297 0 stevel * 298 0 stevel * These are various macros that are useful in various spots in the 299 0 stevel * implementation, along with a few random constants that have no meaning 300 0 stevel * outside of the implementation. There is no real structure to this cpp 301 0 stevel * mishmash -- but is there ever? 
302 0 stevel */ 303 0 stevel #define DTRACE_HASHSTR(hash, probe) \ 304 0 stevel dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs))) 305 0 stevel 306 0 stevel #define DTRACE_HASHNEXT(hash, probe) \ 307 0 stevel (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs) 308 0 stevel 309 0 stevel #define DTRACE_HASHPREV(hash, probe) \ 310 0 stevel (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs) 311 0 stevel 312 0 stevel #define DTRACE_HASHEQ(hash, lhs, rhs) \ 313 0 stevel (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \ 314 0 stevel *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0) 315 0 stevel 316 0 stevel #define DTRACE_AGGHASHSIZE_SLEW 17 317 4291 brendan 318 4291 brendan #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3) 319 0 stevel 320 0 stevel /* 321 0 stevel * The key for a thread-local variable consists of the lower 61 bits of the 322 0 stevel * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL. 323 0 stevel * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never 324 0 stevel * equal to a variable identifier. This is necessary (but not sufficient) to 325 0 stevel * assure that global associative arrays never collide with thread-local 326 0 stevel * variables. To guarantee that they cannot collide, we must also define the 327 0 stevel * order for keying dynamic variables. That order is: 328 0 stevel * 329 0 stevel * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ] 330 0 stevel * 331 0 stevel * Because the variable-key and the tls-key are in orthogonal spaces, there is 332 0 stevel * no way for a global variable key signature to match a thread-local key 333 0 stevel * signature. 
334 0 stevel */ 335 0 stevel #define DTRACE_TLS_THRKEY(where) { \ 336 0 stevel uint_t intr = 0; \ 337 0 stevel uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \ 338 0 stevel for (; actv; actv >>= 1) \ 339 0 stevel intr++; \ 340 0 stevel ASSERT(intr < (1 << 3)); \ 341 0 stevel (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \ 342 0 stevel (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ 343 0 stevel } 344 2769 ahl 345 2769 ahl #define DT_BSWAP_8(x) ((x) & 0xff) 346 2769 ahl #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8)) 347 2769 ahl #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16)) 348 2769 ahl #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32)) 349 5984 jhaslam 350 5984 jhaslam #define DT_MASK_LO 0x00000000FFFFFFFFULL 351 0 stevel 352 0 stevel #define DTRACE_STORE(type, tomax, offset, what) \ 353 0 stevel *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what); 354 0 stevel 355 0 stevel #ifndef __i386 356 0 stevel #define DTRACE_ALIGNCHECK(addr, size, flags) \ 357 0 stevel if (addr & (size - 1)) { \ 358 0 stevel *flags |= CPU_DTRACE_BADALIGN; \ 359 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \ 360 0 stevel return (0); \ 361 0 stevel } 362 0 stevel #else 363 0 stevel #define DTRACE_ALIGNCHECK(addr, size, flags) 364 0 stevel #endif 365 2870 dp 366 2870 dp /* 367 2870 dp * Test whether a range of memory starting at testaddr of size testsz falls 368 2922 dp * within the range of memory described by addr, sz. We take care to avoid 369 2922 dp * problems with overflow and underflow of the unsigned quantities, and 370 2922 dp * disallow all negative sizes. Ranges of size 0 are allowed. 
371 2870 dp */ 372 2870 dp #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ 373 2870 dp ((testaddr) - (baseaddr) < (basesz) && \ 374 2922 dp (testaddr) + (testsz) - (baseaddr) <= (basesz) && \ 375 2922 dp (testaddr) + (testsz) >= (testaddr)) 376 2922 dp 377 2922 dp /* 378 2922 dp * Test whether alloc_sz bytes will fit in the scratch region. We isolate 379 2922 dp * alloc_sz on the righthand side of the comparison in order to avoid overflow 380 2922 dp * or underflow in the comparison with it. This is simpler than the INRANGE 381 2922 dp * check above, because we know that the dtms_scratch_ptr is valid in the 382 2922 dp * range. Allocations of size zero are allowed. 383 2922 dp */ 384 2922 dp #define DTRACE_INSCRATCH(mstate, alloc_sz) \ 385 2922 dp ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \ 386 2922 dp (mstate)->dtms_scratch_ptr >= (alloc_sz)) 387 0 stevel 388 0 stevel #define DTRACE_LOADFUNC(bits) \ 389 0 stevel /*CSTYLED*/ \ 390 0 stevel uint##bits##_t \ 391 0 stevel dtrace_load##bits(uintptr_t addr) \ 392 0 stevel { \ 393 0 stevel size_t size = bits / NBBY; \ 394 0 stevel /*CSTYLED*/ \ 395 0 stevel uint##bits##_t rval; \ 396 0 stevel int i; \ 397 0 stevel volatile uint16_t *flags = (volatile uint16_t *) \ 398 0 stevel &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \ 399 0 stevel \ 400 0 stevel DTRACE_ALIGNCHECK(addr, size, flags); \ 401 0 stevel \ 402 0 stevel for (i = 0; i < dtrace_toxranges; i++) { \ 403 0 stevel if (addr >= dtrace_toxrange[i].dtt_limit) \ 404 0 stevel continue; \ 405 0 stevel \ 406 0 stevel if (addr + size <= dtrace_toxrange[i].dtt_base) \ 407 0 stevel continue; \ 408 0 stevel \ 409 0 stevel /* \ 410 0 stevel * This address falls within a toxic region; return 0. 
\ 411 0 stevel */ \ 412 0 stevel *flags |= CPU_DTRACE_BADADDR; \ 413 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \ 414 0 stevel return (0); \ 415 0 stevel } \ 416 0 stevel \ 417 0 stevel *flags |= CPU_DTRACE_NOFAULT; \ 418 0 stevel /*CSTYLED*/ \ 419 0 stevel rval = *((volatile uint##bits##_t *)addr); \ 420 0 stevel *flags &= ~CPU_DTRACE_NOFAULT; \ 421 0 stevel \ 422 3043 bmc return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \ 423 0 stevel } 424 0 stevel 425 0 stevel #ifdef _LP64 426 0 stevel #define dtrace_loadptr dtrace_load64 427 0 stevel #else 428 0 stevel #define dtrace_loadptr dtrace_load32 429 0 stevel #endif 430 0 stevel 431 1739 bmc #define DTRACE_DYNHASH_FREE 0 432 1739 bmc #define DTRACE_DYNHASH_SINK 1 433 1739 bmc #define DTRACE_DYNHASH_VALID 2 434 1739 bmc 435 8803 Jonathan #define DTRACE_MATCH_FAIL -1 436 0 stevel #define DTRACE_MATCH_NEXT 0 437 0 stevel #define DTRACE_MATCH_DONE 1 438 0 stevel #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') 439 0 stevel #define DTRACE_STATE_ALIGN 64 440 491 bmc 441 491 bmc #define DTRACE_FLAGS2FLT(flags) \ 442 491 bmc (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \ 443 491 bmc ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \ 444 491 bmc ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \ 445 491 bmc ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \ 446 491 bmc ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \ 447 491 bmc ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \ 448 491 bmc ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \ 449 491 bmc ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \ 450 3682 jhaslam ((flags) & CPU_DTRACE_BADSTACK) ? 
DTRACEFLT_BADSTACK : \ 451 491 bmc DTRACEFLT_UNKNOWN) 452 0 stevel 453 1017 bmc #define DTRACEACT_ISSTRING(act) \ 454 1017 bmc ((act)->dta_kind == DTRACEACT_DIFEXPR && \ 455 1017 bmc (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) 456 1017 bmc 457 2870 dp static size_t dtrace_strlen(const char *, size_t); 458 0 stevel static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id); 459 0 stevel static void dtrace_enabling_provide(dtrace_provider_t *); 460 0 stevel static int dtrace_enabling_match(dtrace_enabling_t *, int *); 461 0 stevel static void dtrace_enabling_matchall(void); 462 0 stevel static dtrace_state_t *dtrace_anon_grab(void); 463 0 stevel static uint64_t dtrace_helper(int, dtrace_mstate_t *, 464 0 stevel dtrace_state_t *, uint64_t, uint64_t); 465 0 stevel static dtrace_helpers_t *dtrace_helpers_create(proc_t *); 466 0 stevel static void dtrace_buffer_drop(dtrace_buffer_t *); 467 0 stevel static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t, 468 0 stevel dtrace_state_t *, dtrace_mstate_t *); 469 0 stevel static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, 470 0 stevel dtrace_optval_t); 471 0 stevel static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); 472 2179 ahl static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); 473 0 stevel 474 0 stevel /* 475 0 stevel * DTrace Probe Context Functions 476 0 stevel * 477 0 stevel * These functions are called from probe context. Because probe context is 478 0 stevel * any context in which C may be called, arbitrarily locks may be held, 479 0 stevel * interrupts may be disabled, we may be in arbitrary dispatched state, etc. 480 0 stevel * As a result, functions called from probe context may only call other DTrace 481 0 stevel * support functions -- they may not interact at all with the system at large. 
482 0 stevel * (Note that the ASSERT macro is made probe-context safe by redefining it in 483 0 stevel * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary 484 0 stevel * loads are to be performed from probe context, they _must_ be in terms of 485 0 stevel * the safe dtrace_load*() variants. 486 0 stevel * 487 0 stevel * Some functions in this block are not actually called from probe context; 488 0 stevel * for these functions, there will be a comment above the function reading 489 0 stevel * "Note: not called from probe context." 490 0 stevel */ 491 0 stevel void 492 0 stevel dtrace_panic(const char *format, ...) 493 0 stevel { 494 0 stevel va_list alist; 495 0 stevel 496 0 stevel va_start(alist, format); 497 0 stevel dtrace_vpanic(format, alist); 498 0 stevel va_end(alist); 499 0 stevel } 500 0 stevel 501 0 stevel int 502 0 stevel dtrace_assfail(const char *a, const char *f, int l) 503 0 stevel { 504 0 stevel dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l); 505 0 stevel 506 0 stevel /* 507 0 stevel * We just need something here that even the most clever compiler 508 0 stevel * cannot optimize away. 509 0 stevel */ 510 0 stevel return (a[(uintptr_t)f]); 511 0 stevel } 512 0 stevel 513 0 stevel /* 514 457 bmc * Atomically increment a specified error counter from probe context. 515 457 bmc */ 516 457 bmc static void 517 457 bmc dtrace_error(uint32_t *counter) 518 457 bmc { 519 457 bmc /* 520 457 bmc * Most counters stored to in probe context are per-CPU counters. 521 457 bmc * However, there are some error conditions that are sufficiently 522 457 bmc * arcane that they don't merit per-CPU storage. If these counters 523 457 bmc * are incremented concurrently on different CPUs, scalability will be 524 457 bmc * adversely affected -- but we don't expect them to be white-hot in a 525 457 bmc * correctly constructed enabling... 
526 457 bmc */ 527 457 bmc uint32_t oval, nval; 528 457 bmc 529 457 bmc do { 530 457 bmc oval = *counter; 531 457 bmc 532 457 bmc if ((nval = oval + 1) == 0) { 533 457 bmc /* 534 457 bmc * If the counter would wrap, set it to 1 -- assuring 535 457 bmc * that the counter is never zero when we have seen 536 457 bmc * errors. (The counter must be 32-bits because we 537 457 bmc * aren't guaranteed a 64-bit compare&swap operation.) 538 457 bmc * To save this code both the infamy of being fingered 539 457 bmc * by a priggish news story and the indignity of being 540 457 bmc * the target of a neo-puritan witch trial, we're 541 457 bmc * carefully avoiding any colorful description of the 542 457 bmc * likelihood of this condition -- but suffice it to 543 457 bmc * say that it is only slightly more likely than the 544 457 bmc * overflow of predicate cache IDs, as discussed in 545 457 bmc * dtrace_predicate_create(). 546 457 bmc */ 547 457 bmc nval = 1; 548 457 bmc } 549 457 bmc } while (dtrace_cas32(counter, oval, nval) != oval); 550 457 bmc } 551 457 bmc 552 457 bmc /* 553 0 stevel * Use the DTRACE_LOADFUNC macro to define functions for each of loading a 554 0 stevel * uint8_t, a uint16_t, a uint32_t and a uint64_t. 
555 0 stevel */ 556 0 stevel DTRACE_LOADFUNC(8) 557 0 stevel DTRACE_LOADFUNC(16) 558 0 stevel DTRACE_LOADFUNC(32) 559 0 stevel DTRACE_LOADFUNC(64) 560 0 stevel 561 0 stevel static int 562 0 stevel dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate) 563 0 stevel { 564 0 stevel if (dest < mstate->dtms_scratch_base) 565 0 stevel return (0); 566 0 stevel 567 0 stevel if (dest + size < dest) 568 0 stevel return (0); 569 0 stevel 570 0 stevel if (dest + size > mstate->dtms_scratch_ptr) 571 0 stevel return (0); 572 0 stevel 573 0 stevel return (1); 574 0 stevel } 575 0 stevel 576 0 stevel static int 577 0 stevel dtrace_canstore_statvar(uint64_t addr, size_t sz, 578 0 stevel dtrace_statvar_t **svars, int nsvars) 579 0 stevel { 580 0 stevel int i; 581 0 stevel 582 0 stevel for (i = 0; i < nsvars; i++) { 583 0 stevel dtrace_statvar_t *svar = svars[i]; 584 0 stevel 585 0 stevel if (svar == NULL || svar->dtsv_size == 0) 586 0 stevel continue; 587 0 stevel 588 2870 dp if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) 589 0 stevel return (1); 590 0 stevel } 591 0 stevel 592 0 stevel return (0); 593 0 stevel } 594 0 stevel 595 0 stevel /* 596 0 stevel * Check to see if the address is within a memory region to which a store may 597 0 stevel * be issued. This includes the DTrace scratch areas, and any DTrace variable 598 0 stevel * region. The caller of dtrace_canstore() is responsible for performing any 599 0 stevel * alignment checks that are needed before stores are actually executed. 600 0 stevel */ 601 0 stevel static int 602 0 stevel dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, 603 0 stevel dtrace_vstate_t *vstate) 604 0 stevel { 605 0 stevel /* 606 0 stevel * First, check to see if the address is in scratch space... 
607 0 stevel */ 608 2870 dp if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base, 609 2870 dp mstate->dtms_scratch_size)) 610 0 stevel return (1); 611 0 stevel 612 0 stevel /* 613 0 stevel * Now check to see if it's a dynamic variable. This check will pick 614 0 stevel * up both thread-local variables and any global dynamically-allocated 615 0 stevel * variables. 616 0 stevel */ 617 2870 dp if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base, 618 4682 jhaslam vstate->dtvs_dynvars.dtds_size)) { 619 4682 jhaslam dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; 620 4682 jhaslam uintptr_t base = (uintptr_t)dstate->dtds_base + 621 4682 jhaslam (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t)); 622 4682 jhaslam uintptr_t chunkoffs; 623 4682 jhaslam 624 4682 jhaslam /* 625 4682 jhaslam * Before we assume that we can store here, we need to make 626 4682 jhaslam * sure that it isn't in our metadata -- storing to our 627 4682 jhaslam * dynamic variable metadata would corrupt our state. 
For 628 4682 jhaslam * the range to not include any dynamic variable metadata, 629 4682 jhaslam * it must: 630 4682 jhaslam * 631 4682 jhaslam * (1) Start above the hash table that is at the base of 632 4682 jhaslam * the dynamic variable space 633 4682 jhaslam * 634 4682 jhaslam * (2) Have a starting chunk offset that is beyond the 635 4682 jhaslam * dtrace_dynvar_t that is at the base of every chunk 636 4682 jhaslam * 637 4682 jhaslam * (3) Not span a chunk boundary 638 4682 jhaslam * 639 4682 jhaslam */ 640 4682 jhaslam if (addr < base) 641 4682 jhaslam return (0); 642 4682 jhaslam 643 4682 jhaslam chunkoffs = (addr - base) % dstate->dtds_chunksize; 644 4682 jhaslam 645 4682 jhaslam if (chunkoffs < sizeof (dtrace_dynvar_t)) 646 4682 jhaslam return (0); 647 4682 jhaslam 648 4682 jhaslam if (chunkoffs + sz > dstate->dtds_chunksize) 649 4682 jhaslam return (0); 650 4682 jhaslam 651 4682 jhaslam return (1); 652 4682 jhaslam } 653 0 stevel 654 0 stevel /* 655 0 stevel * Finally, check the static local and global variables. These checks 656 0 stevel * take the longest, so we perform them last. 657 0 stevel */ 658 0 stevel if (dtrace_canstore_statvar(addr, sz, 659 0 stevel vstate->dtvs_locals, vstate->dtvs_nlocals)) 660 0 stevel return (1); 661 0 stevel 662 0 stevel if (dtrace_canstore_statvar(addr, sz, 663 0 stevel vstate->dtvs_globals, vstate->dtvs_nglobals)) 664 0 stevel return (1); 665 0 stevel 666 0 stevel return (0); 667 2870 dp } 668 2870 dp 669 2870 dp 670 2870 dp /* 671 2870 dp * Convenience routine to check to see if the address is within a memory 672 2870 dp * region in which a load may be issued given the user's privilege level; 673 2870 dp * if not, it sets the appropriate error flags and loads 'addr' into the 674 2870 dp * illegal value slot. 675 2870 dp * 676 2870 dp * DTrace subroutines (DIF_SUBR_*) should use this helper to implement 677 2870 dp * appropriate memory access protection. 
678 2870 dp */ 679 2870 dp static int 680 2870 dp dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, 681 2870 dp dtrace_vstate_t *vstate) 682 2870 dp { 683 2870 dp volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval; 684 2870 dp 685 2870 dp /* 686 2870 dp * If we hold the privilege to read from kernel memory, then 687 2870 dp * everything is readable. 688 2870 dp */ 689 2870 dp if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) 690 2870 dp return (1); 691 2870 dp 692 2870 dp /* 693 2870 dp * You can obviously read that which you can store. 694 2870 dp */ 695 2870 dp if (dtrace_canstore(addr, sz, mstate, vstate)) 696 2870 dp return (1); 697 2870 dp 698 2870 dp /* 699 2870 dp * We're allowed to read from our own string table. 700 2870 dp */ 701 2870 dp if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab, 702 2870 dp mstate->dtms_difo->dtdo_strlen)) 703 2870 dp return (1); 704 2870 dp 705 2870 dp DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); 706 2870 dp *illval = addr; 707 2870 dp return (0); 708 2870 dp } 709 2870 dp 710 2870 dp /* 711 2870 dp * Convenience routine to check to see if a given string is within a memory 712 2870 dp * region in which a load may be issued given the user's privilege level; 713 2870 dp * this exists so that we don't need to issue unnecessary dtrace_strlen() 714 2870 dp * calls in the event that the user has all privileges. 715 2870 dp */ 716 2870 dp static int 717 2870 dp dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, 718 2870 dp dtrace_vstate_t *vstate) 719 2870 dp { 720 2870 dp size_t strsz; 721 2870 dp 722 2870 dp /* 723 2870 dp * If we hold the privilege to read from kernel memory, then 724 2870 dp * everything is readable. 
725 2870 dp */ 726 2870 dp if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) 727 2870 dp return (1); 728 2870 dp 729 2870 dp strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz); 730 2870 dp if (dtrace_canload(addr, strsz, mstate, vstate)) 731 2870 dp return (1); 732 2870 dp 733 2870 dp return (0); 734 2870 dp } 735 2870 dp 736 2870 dp /* 737 2870 dp * Convenience routine to check to see if a given variable is within a memory 738 2870 dp * region in which a load may be issued given the user's privilege level. 739 2870 dp */ 740 2870 dp static int 741 2870 dp dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, 742 2870 dp dtrace_vstate_t *vstate) 743 2870 dp { 744 2870 dp size_t sz; 745 2870 dp ASSERT(type->dtdt_flags & DIF_TF_BYREF); 746 2870 dp 747 2870 dp /* 748 2870 dp * If we hold the privilege to read from kernel memory, then 749 2870 dp * everything is readable. 750 2870 dp */ 751 2870 dp if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) 752 2870 dp return (1); 753 2870 dp 754 2870 dp if (type->dtdt_kind == DIF_TYPE_STRING) 755 2870 dp sz = dtrace_strlen(src, 756 2870 dp vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1; 757 2870 dp else 758 2870 dp sz = type->dtdt_size; 759 2870 dp 760 2870 dp return (dtrace_canload((uintptr_t)src, sz, mstate, vstate)); 761 0 stevel } 762 0 stevel 763 0 stevel /* 764 0 stevel * Compare two strings using safe loads. 
765 0 stevel */ 766 0 stevel static int 767 0 stevel dtrace_strncmp(char *s1, char *s2, size_t limit) 768 0 stevel { 769 0 stevel uint8_t c1, c2; 770 0 stevel volatile uint16_t *flags; 771 0 stevel 772 0 stevel if (s1 == s2 || limit == 0) 773 0 stevel return (0); 774 0 stevel 775 0 stevel flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; 776 0 stevel 777 0 stevel do { 778 0 stevel if (s1 == NULL) { 779 0 stevel c1 = '\0'; 780 0 stevel } else { 781 0 stevel c1 = dtrace_load8((uintptr_t)s1++); 782 0 stevel } 783 0 stevel 784 0 stevel if (s2 == NULL) { 785 0 stevel c2 = '\0'; 786 0 stevel } else { 787 0 stevel c2 = dtrace_load8((uintptr_t)s2++); 788 0 stevel } 789 0 stevel 790 0 stevel if (c1 != c2) 791 0 stevel return (c1 - c2); 792 0 stevel } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT)); 793 0 stevel 794 0 stevel return (0); 795 0 stevel } 796 0 stevel 797 0 stevel /* 798 0 stevel * Compute strlen(s) for a string using safe memory accesses. The additional 799 0 stevel * len parameter is used to specify a maximum length to ensure completion. 800 0 stevel */ 801 0 stevel static size_t 802 0 stevel dtrace_strlen(const char *s, size_t lim) 803 0 stevel { 804 0 stevel uint_t len; 805 0 stevel 806 0 stevel for (len = 0; len != lim; len++) { 807 0 stevel if (dtrace_load8((uintptr_t)s++) == '\0') 808 0 stevel break; 809 0 stevel } 810 0 stevel 811 0 stevel return (len); 812 0 stevel } 813 0 stevel 814 0 stevel /* 815 0 stevel * Check if an address falls within a toxic region. 
816 0 stevel */ 817 0 stevel static int 818 0 stevel dtrace_istoxic(uintptr_t kaddr, size_t size) 819 0 stevel { 820 0 stevel uintptr_t taddr, tsize; 821 0 stevel int i; 822 0 stevel 823 0 stevel for (i = 0; i < dtrace_toxranges; i++) { 824 0 stevel taddr = dtrace_toxrange[i].dtt_base; 825 0 stevel tsize = dtrace_toxrange[i].dtt_limit - taddr; 826 0 stevel 827 0 stevel if (kaddr - taddr < tsize) { 828 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); 829 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr; 830 0 stevel return (1); 831 0 stevel } 832 0 stevel 833 0 stevel if (taddr - kaddr < size) { 834 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); 835 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr; 836 0 stevel return (1); 837 0 stevel } 838 0 stevel } 839 0 stevel 840 0 stevel return (0); 841 0 stevel } 842 0 stevel 843 0 stevel /* 844 0 stevel * Copy src to dst using safe memory accesses. The src is assumed to be unsafe 845 0 stevel * memory specified by the DIF program. The dst is assumed to be safe memory 846 0 stevel * that we can store to directly because it is managed by DTrace. As with 847 0 stevel * standard bcopy, overlapping copies are handled properly. 848 0 stevel */ 849 0 stevel static void 850 0 stevel dtrace_bcopy(const void *src, void *dst, size_t len) 851 0 stevel { 852 0 stevel if (len != 0) { 853 0 stevel uint8_t *s1 = dst; 854 0 stevel const uint8_t *s2 = src; 855 0 stevel 856 0 stevel if (s1 <= s2) { 857 0 stevel do { 858 0 stevel *s1++ = dtrace_load8((uintptr_t)s2++); 859 0 stevel } while (--len != 0); 860 0 stevel } else { 861 0 stevel s2 += len; 862 0 stevel s1 += len; 863 0 stevel 864 0 stevel do { 865 0 stevel *--s1 = dtrace_load8((uintptr_t)--s2); 866 0 stevel } while (--len != 0); 867 0 stevel } 868 0 stevel } 869 0 stevel } 870 0 stevel 871 0 stevel /* 872 0 stevel * Copy src to dst using safe memory accesses, up to either the specified 873 0 stevel * length, or the point that a nul byte is encountered. 
The src is assumed to 874 0 stevel * be unsafe memory specified by the DIF program. The dst is assumed to be 875 0 stevel * safe memory that we can store to directly because it is managed by DTrace. 876 0 stevel * Unlike dtrace_bcopy(), overlapping regions are not handled. 877 0 stevel */ 878 0 stevel static void 879 0 stevel dtrace_strcpy(const void *src, void *dst, size_t len) 880 0 stevel { 881 0 stevel if (len != 0) { 882 0 stevel uint8_t *s1 = dst, c; 883 0 stevel const uint8_t *s2 = src; 884 0 stevel 885 0 stevel do { 886 0 stevel *s1++ = c = dtrace_load8((uintptr_t)s2++); 887 0 stevel } while (--len != 0 && c != '\0'); 888 0 stevel } 889 0 stevel } 890 0 stevel 891 0 stevel /* 892 0 stevel * Copy src to dst, deriving the size and type from the specified (BYREF) 893 0 stevel * variable type. The src is assumed to be unsafe memory specified by the DIF 894 0 stevel * program. The dst is assumed to be DTrace variable memory that is of the 895 0 stevel * specified type; we assume that we can store to directly. 896 0 stevel */ 897 0 stevel static void 898 0 stevel dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type) 899 0 stevel { 900 0 stevel ASSERT(type->dtdt_flags & DIF_TF_BYREF); 901 0 stevel 902 0 stevel if (type->dtdt_kind == DIF_TYPE_STRING) { 903 0 stevel dtrace_strcpy(src, dst, type->dtdt_size); 904 0 stevel } else { 905 0 stevel dtrace_bcopy(src, dst, type->dtdt_size); 906 0 stevel } 907 0 stevel } 908 0 stevel 909 0 stevel /* 910 0 stevel * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be 911 0 stevel * unsafe memory specified by the DIF program. The s2 data is assumed to be 912 0 stevel * safe memory that we can access directly because it is managed by DTrace. 
913 0 stevel */ 914 0 stevel static int 915 0 stevel dtrace_bcmp(const void *s1, const void *s2, size_t len) 916 0 stevel { 917 0 stevel volatile uint16_t *flags; 918 0 stevel 919 0 stevel flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; 920 0 stevel 921 0 stevel if (s1 == s2) 922 0 stevel return (0); 923 0 stevel 924 0 stevel if (s1 == NULL || s2 == NULL) 925 0 stevel return (1); 926 0 stevel 927 0 stevel if (s1 != s2 && len != 0) { 928 0 stevel const uint8_t *ps1 = s1; 929 0 stevel const uint8_t *ps2 = s2; 930 0 stevel 931 0 stevel do { 932 0 stevel if (dtrace_load8((uintptr_t)ps1++) != *ps2++) 933 0 stevel return (1); 934 0 stevel } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT)); 935 0 stevel } 936 0 stevel return (0); 937 0 stevel } 938 0 stevel 939 0 stevel /* 940 0 stevel * Zero the specified region using a simple byte-by-byte loop. Note that this 941 0 stevel * is for safe DTrace-managed memory only. 942 0 stevel */ 943 0 stevel static void 944 0 stevel dtrace_bzero(void *dst, size_t len) 945 0 stevel { 946 0 stevel uchar_t *cp; 947 0 stevel 948 0 stevel for (cp = dst; len != 0; len--) 949 0 stevel *cp++ = 0; 950 0 stevel } 951 0 stevel 952 5984 jhaslam static void 953 5984 jhaslam dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum) 954 5984 jhaslam { 955 5984 jhaslam uint64_t result[2]; 956 5984 jhaslam 957 5984 jhaslam result[0] = addend1[0] + addend2[0]; 958 5984 jhaslam result[1] = addend1[1] + addend2[1] + 959 5984 jhaslam (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0); 960 5984 jhaslam 961 5984 jhaslam sum[0] = result[0]; 962 5984 jhaslam sum[1] = result[1]; 963 5984 jhaslam } 964 5984 jhaslam 965 5984 jhaslam /* 966 5984 jhaslam * Shift the 128-bit value in a by b. If b is positive, shift left. 967 5984 jhaslam * If b is negative, shift right. 
/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.  The value is held as two 64-bit words with
 * the low word in a[0] and the high word in a[1]; the shift is logical and
 * is performed in place.  A magnitude of 128 or more clears the value.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	unsigned int n;

	if (b == 0)
		return;

	/*
	 * Take the magnitude in unsigned arithmetic so that even the most
	 * negative int is handled without signed overflow.
	 */
	n = (b < 0) ? 0U - (unsigned int)b : (unsigned int)b;

	/*
	 * Every bit is shifted out.  This must be handled separately because
	 * shifting a 64-bit word by 64 or more is undefined.
	 */
	if (n >= 128) {
		a[0] = 0;
		a[1] = 0;
		return;
	}

	if (b < 0) {
		if (n >= 64) {
			a[0] = a[1] >> (n - 64);
			a[1] = 0;
		} else {
			/*
			 * The low n bits of the high word become the top n
			 * bits of the low word.  The left shift by (64 - n)
			 * discards exactly the bits that do not carry over,
			 * so no mask is needed (n is in 1..63 here).
			 */
			a[0] = (a[0] >> n) | (a[1] << (64 - n));
			a[1] >>= n;
		}
	} else {
		if (n >= 64) {
			a[1] = a[0] << (n - 64);
			a[0] = 0;
		} else {
			/*
			 * The top n bits of the low word become the low n
			 * bits of the high word.
			 */
			a[1] = (a[1] << n) | (a[0] >> (64 - n));
			a[0] <<= n;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 */
1006 5984 jhaslam * 1007 5984 jhaslam * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) = 1008 5984 jhaslam * hi1 * hi2 << 64 + 1009 5984 jhaslam * hi1 * lo2 << 32 + 1010 5984 jhaslam * hi2 * lo1 << 32 + 1011 5984 jhaslam * lo1 * lo2 1012 5984 jhaslam */ 1013 5984 jhaslam static void 1014 5984 jhaslam dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product) 1015 5984 jhaslam { 1016 5984 jhaslam uint64_t hi1, hi2, lo1, lo2; 1017 5984 jhaslam uint64_t tmp[2]; 1018 5984 jhaslam 1019 5984 jhaslam hi1 = factor1 >> 32; 1020 5984 jhaslam hi2 = factor2 >> 32; 1021 5984 jhaslam 1022 5984 jhaslam lo1 = factor1 & DT_MASK_LO; 1023 5984 jhaslam lo2 = factor2 & DT_MASK_LO; 1024 5984 jhaslam 1025 5984 jhaslam product[0] = lo1 * lo2; 1026 5984 jhaslam product[1] = hi1 * hi2; 1027 5984 jhaslam 1028 5984 jhaslam tmp[0] = hi1 * lo2; 1029 5984 jhaslam tmp[1] = 0; 1030 5984 jhaslam dtrace_shift_128(tmp, 32); 1031 5984 jhaslam dtrace_add_128(product, tmp, product); 1032 5984 jhaslam 1033 5984 jhaslam tmp[0] = hi2 * lo1; 1034 5984 jhaslam tmp[1] = 0; 1035 5984 jhaslam dtrace_shift_128(tmp, 32); 1036 5984 jhaslam dtrace_add_128(product, tmp, product); 1037 5984 jhaslam } 1038 5984 jhaslam 1039 0 stevel /* 1040 1677 dp * This privilege check should be used by actions and subroutines to 1041 1677 dp * verify that the user credentials of the process that enabled the 1042 1677 dp * invoking ECB match the target credentials 1043 1677 dp */ 1044 1677 dp static int 1045 1677 dp dtrace_priv_proc_common_user(dtrace_state_t *state) 1046 1677 dp { 1047 1677 dp cred_t *cr, *s_cr = state->dts_cred.dcr_cred; 1048 1677 dp 1049 1677 dp /* 1050 1677 dp * We should always have a non-NULL state cred here, since if cred 1051 1677 dp * is null (anonymous tracing), we fast-path bypass this routine. 
1052 1677 dp */ 1053 1677 dp ASSERT(s_cr != NULL); 1054 1677 dp 1055 1677 dp if ((cr = CRED()) != NULL && 1056 1677 dp s_cr->cr_uid == cr->cr_uid && 1057 1677 dp s_cr->cr_uid == cr->cr_ruid && 1058 1677 dp s_cr->cr_uid == cr->cr_suid && 1059 1677 dp s_cr->cr_gid == cr->cr_gid && 1060 1677 dp s_cr->cr_gid == cr->cr_rgid && 1061 1677 dp s_cr->cr_gid == cr->cr_sgid) 1062 1677 dp return (1); 1063 1677 dp 1064 1677 dp return (0); 1065 1677 dp } 1066 1677 dp 1067 1677 dp /* 1068 1677 dp * This privilege check should be used by actions and subroutines to 1069 1677 dp * verify that the zone of the process that enabled the invoking ECB 1070 1677 dp * matches the target credentials 1071 1677 dp */ 1072 1677 dp static int 1073 1677 dp dtrace_priv_proc_common_zone(dtrace_state_t *state) 1074 1677 dp { 1075 1677 dp cred_t *cr, *s_cr = state->dts_cred.dcr_cred; 1076 1677 dp 1077 1677 dp /* 1078 1677 dp * We should always have a non-NULL state cred here, since if cred 1079 1677 dp * is null (anonymous tracing), we fast-path bypass this routine. 1080 1677 dp */ 1081 1677 dp ASSERT(s_cr != NULL); 1082 1677 dp 1083 1677 dp if ((cr = CRED()) != NULL && 1084 1677 dp s_cr->cr_zone == cr->cr_zone) 1085 1677 dp return (1); 1086 1677 dp 1087 1677 dp return (0); 1088 1677 dp } 1089 1677 dp 1090 1677 dp /* 1091 1677 dp * This privilege check should be used by actions and subroutines to 1092 1677 dp * verify that the process has not setuid or changed credentials. 
1093 1677 dp */ 1094 1677 dp static int 1095 1677 dp dtrace_priv_proc_common_nocd() 1096 1677 dp { 1097 0 stevel proc_t *proc; 1098 0 stevel 1099 1677 dp if ((proc = ttoproc(curthread)) != NULL && 1100 0 stevel !(proc->p_flag & SNOCD)) 1101 0 stevel return (1); 1102 0 stevel 1103 1677 dp return (0); 1104 1677 dp } 1105 1677 dp 1106 1677 dp static int 1107 1677 dp dtrace_priv_proc_destructive(dtrace_state_t *state) 1108 1677 dp { 1109 1677 dp int action = state->dts_cred.dcr_action; 1110 1677 dp 1111 1677 dp if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) && 1112 1677 dp dtrace_priv_proc_common_zone(state) == 0) 1113 1677 dp goto bad; 1114 1677 dp 1115 1677 dp if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) && 1116 1677 dp dtrace_priv_proc_common_user(state) == 0) 1117 1677 dp goto bad; 1118 1677 dp 1119 1677 dp if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) && 1120 1677 dp dtrace_priv_proc_common_nocd() == 0) 1121 1677 dp goto bad; 1122 1677 dp 1123 1677 dp return (1); 1124 1677 dp 1125 1677 dp bad: 1126 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; 1127 0 stevel 1128 0 stevel return (0); 1129 0 stevel } 1130 0 stevel 1131 0 stevel static int 1132 0 stevel dtrace_priv_proc_control(dtrace_state_t *state) 1133 0 stevel { 1134 0 stevel if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL) 1135 0 stevel return (1); 1136 0 stevel 1137 1677 dp if (dtrace_priv_proc_common_zone(state) && 1138 1677 dp dtrace_priv_proc_common_user(state) && 1139 1677 dp dtrace_priv_proc_common_nocd()) 1140 1677 dp return (1); 1141 1677 dp 1142 1677 dp cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; 1143 1677 dp 1144 1677 dp return (0); 1145 0 stevel } 1146 0 stevel 1147 0 stevel static int 1148 0 stevel dtrace_priv_proc(dtrace_state_t *state) 1149 0 stevel { 1150 0 stevel if (state->dts_cred.dcr_action & DTRACE_CRA_PROC) 1151 0 stevel return (1); 1152 0 stevel 1153 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= 
CPU_DTRACE_UPRIV; 1154 0 stevel 1155 0 stevel return (0); 1156 0 stevel } 1157 0 stevel 1158 0 stevel static int 1159 0 stevel dtrace_priv_kernel(dtrace_state_t *state) 1160 0 stevel { 1161 0 stevel if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL) 1162 0 stevel return (1); 1163 0 stevel 1164 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV; 1165 0 stevel 1166 0 stevel return (0); 1167 0 stevel } 1168 0 stevel 1169 0 stevel static int 1170 0 stevel dtrace_priv_kernel_destructive(dtrace_state_t *state) 1171 0 stevel { 1172 0 stevel if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE) 1173 0 stevel return (1); 1174 0 stevel 1175 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV; 1176 0 stevel 1177 0 stevel return (0); 1178 0 stevel } 1179 0 stevel 1180 0 stevel /* 1181 0 stevel * Note: not called from probe context. This function is called 1182 0 stevel * asynchronously (and at a regular interval) from outside of probe context to 1183 0 stevel * clean the dirty dynamic variable lists on all CPUs. Dynamic variable 1184 0 stevel * cleaning is explained in detail in <sys/dtrace_impl.h>. 1185 0 stevel */ 1186 0 stevel void 1187 0 stevel dtrace_dynvar_clean(dtrace_dstate_t *dstate) 1188 0 stevel { 1189 0 stevel dtrace_dynvar_t *dirty; 1190 0 stevel dtrace_dstate_percpu_t *dcpu; 1191 0 stevel int i, work = 0; 1192 0 stevel 1193 0 stevel for (i = 0; i < NCPU; i++) { 1194 0 stevel dcpu = &dstate->dtds_percpu[i]; 1195 0 stevel 1196 0 stevel ASSERT(dcpu->dtdsc_rinsing == NULL); 1197 0 stevel 1198 0 stevel /* 1199 0 stevel * If the dirty list is NULL, there is no dirty work to do. 
1200 0 stevel */ 1201 0 stevel if (dcpu->dtdsc_dirty == NULL) 1202 0 stevel continue; 1203 0 stevel 1204 0 stevel /* 1205 0 stevel * If the clean list is non-NULL, then we're not going to do 1206 0 stevel * any work for this CPU -- it means that there has not been 1207 0 stevel * a dtrace_dynvar() allocation on this CPU (or from this CPU) 1208 0 stevel * since the last time we cleaned house. 1209 0 stevel */ 1210 0 stevel if (dcpu->dtdsc_clean != NULL) 1211 0 stevel continue; 1212 0 stevel 1213 0 stevel work = 1; 1214 0 stevel 1215 0 stevel /* 1216 0 stevel * Atomically move the dirty list aside. 1217 0 stevel */ 1218 0 stevel do { 1219 0 stevel dirty = dcpu->dtdsc_dirty; 1220 0 stevel 1221 0 stevel /* 1222 0 stevel * Before we zap the dirty list, set the rinsing list. 1223 0 stevel * (This allows for a potential assertion in 1224 0 stevel * dtrace_dynvar(): if a free dynamic variable appears 1225 0 stevel * on a hash chain, either the dirty list or the 1226 0 stevel * rinsing list for some CPU must be non-NULL.) 1227 0 stevel */ 1228 0 stevel dcpu->dtdsc_rinsing = dirty; 1229 0 stevel dtrace_membar_producer(); 1230 0 stevel } while (dtrace_casptr(&dcpu->dtdsc_dirty, 1231 0 stevel dirty, NULL) != dirty); 1232 0 stevel } 1233 0 stevel 1234 0 stevel if (!work) { 1235 0 stevel /* 1236 0 stevel * We have no work to do; we can simply return. 1237 0 stevel */ 1238 0 stevel return; 1239 0 stevel } 1240 0 stevel 1241 0 stevel dtrace_sync(); 1242 0 stevel 1243 0 stevel for (i = 0; i < NCPU; i++) { 1244 0 stevel dcpu = &dstate->dtds_percpu[i]; 1245 0 stevel 1246 0 stevel if (dcpu->dtdsc_rinsing == NULL) 1247 0 stevel continue; 1248 0 stevel 1249 0 stevel /* 1250 0 stevel * We are now guaranteed that no hash chain contains a pointer 1251 0 stevel * into this dirty list; we can make it clean. 
1252 0 stevel */ 1253 0 stevel ASSERT(dcpu->dtdsc_clean == NULL); 1254 0 stevel dcpu->dtdsc_clean = dcpu->dtdsc_rinsing; 1255 0 stevel dcpu->dtdsc_rinsing = NULL; 1256 0 stevel } 1257 0 stevel 1258 0 stevel /* 1259 0 stevel * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make 1260 0 stevel * sure that all CPUs have seen all of the dtdsc_clean pointers. 1261 0 stevel * This prevents a race whereby a CPU incorrectly decides that 1262 0 stevel * the state should be something other than DTRACE_DSTATE_CLEAN 1263 0 stevel * after dtrace_dynvar_clean() has completed. 1264 0 stevel */ 1265 0 stevel dtrace_sync(); 1266 0 stevel 1267 0 stevel dstate->dtds_state = DTRACE_DSTATE_CLEAN; 1268 0 stevel } 1269 0 stevel 1270 0 stevel /* 1271 0 stevel * Depending on the value of the op parameter, this function looks-up, 1272 0 stevel * allocates or deallocates an arbitrarily-keyed dynamic variable. If an 1273 0 stevel * allocation is requested, this function will return a pointer to a 1274 0 stevel * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no 1275 0 stevel * variable can be allocated. If NULL is returned, the appropriate counter 1276 0 stevel * will be incremented. 
1277 0 stevel */ 1278 0 stevel dtrace_dynvar_t * 1279 0 stevel dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys, 1280 2870 dp dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op, 1281 2870 dp dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) 1282 0 stevel { 1283 1739 bmc uint64_t hashval = DTRACE_DYNHASH_VALID; 1284 0 stevel dtrace_dynhash_t *hash = dstate->dtds_hash; 1285 0 stevel dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL; 1286 0 stevel processorid_t me = CPU->cpu_id, cpu = me; 1287 0 stevel dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me]; 1288 0 stevel size_t bucket, ksize; 1289 0 stevel size_t chunksize = dstate->dtds_chunksize; 1290 0 stevel uintptr_t kdata, lock, nstate; 1291 0 stevel uint_t i; 1292 0 stevel 1293 0 stevel ASSERT(nkeys != 0); 1294 0 stevel 1295 0 stevel /* 1296 0 stevel * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time" 1297 0 stevel * algorithm. For the by-value portions, we perform the algorithm in 1298 0 stevel * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a 1299 0 stevel * bit, and seems to have only a minute effect on distribution. For 1300 0 stevel * the by-reference data, we perform "One-at-a-time" iterating (safely) 1301 0 stevel * over each referenced byte. It's painful to do this, but it's much 1302 0 stevel * better than pathological hash distribution. The efficacy of the 1303 0 stevel * hashing algorithm (and a comparison with other algorithms) may be 1304 0 stevel * found by running the ::dtrace_dynstat MDB dcmd. 
1305 0 stevel */ 1306 0 stevel for (i = 0; i < nkeys; i++) { 1307 0 stevel if (key[i].dttk_size == 0) { 1308 0 stevel uint64_t val = key[i].dttk_value; 1309 0 stevel 1310 0 stevel hashval += (val >> 48) & 0xffff; 1311 0 stevel hashval += (hashval << 10); 1312 0 stevel hashval ^= (hashval >> 6); 1313 0 stevel 1314 0 stevel hashval += (val >> 32) & 0xffff; 1315 0 stevel hashval += (hashval << 10); 1316 0 stevel hashval ^= (hashval >> 6); 1317 0 stevel 1318 0 stevel hashval += (val >> 16) & 0xffff; 1319 0 stevel hashval += (hashval << 10); 1320 0 stevel hashval ^= (hashval >> 6); 1321 0 stevel 1322 0 stevel hashval += val & 0xffff; 1323 0 stevel hashval += (hashval << 10); 1324 0 stevel hashval ^= (hashval >> 6); 1325 0 stevel } else { 1326 0 stevel /* 1327 0 stevel * This is incredibly painful, but it beats the hell 1328 0 stevel * out of the alternative. 1329 0 stevel */ 1330 0 stevel uint64_t j, size = key[i].dttk_size; 1331 0 stevel uintptr_t base = (uintptr_t)key[i].dttk_value; 1332 0 stevel 1333 2870 dp if (!dtrace_canload(base, size, mstate, vstate)) 1334 2870 dp break; 1335 2870 dp 1336 0 stevel for (j = 0; j < size; j++) { 1337 0 stevel hashval += dtrace_load8(base + j); 1338 0 stevel hashval += (hashval << 10); 1339 0 stevel hashval ^= (hashval >> 6); 1340 0 stevel } 1341 0 stevel } 1342 0 stevel } 1343 2870 dp 1344 2870 dp if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) 1345 2870 dp return (NULL); 1346 0 stevel 1347 0 stevel hashval += (hashval << 3); 1348 0 stevel hashval ^= (hashval >> 11); 1349 0 stevel hashval += (hashval << 15); 1350 0 stevel 1351 0 stevel /* 1352 1739 bmc * There is a remote chance (ideally, 1 in 2^31) that our hashval 1353 1739 bmc * comes out to be one of our two sentinel hash values. If this 1354 1739 bmc * actually happens, we set the hashval to be a value known to be a 1355 1739 bmc * non-sentinel value. 
1356 1739 bmc */ 1357 1739 bmc if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK) 1358 1739 bmc hashval = DTRACE_DYNHASH_VALID; 1359 0 stevel 1360 0 stevel /* 1361 0 stevel * Yes, it's painful to do a divide here. If the cycle count becomes 1362 0 stevel * important here, tricks can be pulled to reduce it. (However, it's 1363 0 stevel * critical that hash collisions be kept to an absolute minimum; 1364 0 stevel * they're much more painful than a divide.) It's better to have a 1365 0 stevel * solution that generates few collisions and still keeps things 1366 0 stevel * relatively simple. 1367 0 stevel */ 1368 0 stevel bucket = hashval % dstate->dtds_hashsize; 1369 0 stevel 1370 0 stevel if (op == DTRACE_DYNVAR_DEALLOC) { 1371 0 stevel volatile uintptr_t *lockp = &hash[bucket].dtdh_lock; 1372 0 stevel 1373 0 stevel for (;;) { 1374 0 stevel while ((lock = *lockp) & 1) 1375 0 stevel continue; 1376 0 stevel 1377 0 stevel if (dtrace_casptr((void *)lockp, 1378 0 stevel (void *)lock, (void *)(lock + 1)) == (void *)lock) 1379 0 stevel break; 1380 0 stevel } 1381 0 stevel 1382 0 stevel dtrace_membar_producer(); 1383 0 stevel } 1384 0 stevel 1385 0 stevel top: 1386 0 stevel prev = NULL; 1387 0 stevel lock = hash[bucket].dtdh_lock; 1388 0 stevel 1389 0 stevel dtrace_membar_consumer(); 1390 0 stevel 1391 0 stevel start = hash[bucket].dtdh_chain; 1392 1739 bmc ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK || 1393 1739 bmc start->dtdv_hashval != DTRACE_DYNHASH_FREE || 1394 1739 bmc op != DTRACE_DYNVAR_DEALLOC)); 1395 0 stevel 1396 0 stevel for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) { 1397 0 stevel dtrace_tuple_t *dtuple = &dvar->dtdv_tuple; 1398 0 stevel dtrace_key_t *dkey = &dtuple->dtt_key[0]; 1399 0 stevel 1400 0 stevel if (dvar->dtdv_hashval != hashval) { 1401 1739 bmc if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) { 1402 1739 bmc /* 1403 1739 bmc * We've reached the sink, and therefore the 1404 1739 bmc * end of the hash 
chain; we can kick out of 1405 1739 bmc * the loop knowing that we have seen a valid 1406 1739 bmc * snapshot of state. 1407 1739 bmc */ 1408 1739 bmc ASSERT(dvar->dtdv_next == NULL); 1409 1739 bmc ASSERT(dvar == &dtrace_dynhash_sink); 1410 1739 bmc break; 1411 1739 bmc } 1412 1739 bmc 1413 1739 bmc if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) { 1414 1739 bmc /* 1415 1739 bmc * We've gone off the rails: somewhere along 1416 1739 bmc * the line, one of the members of this hash 1417 1739 bmc * chain was deleted. Note that we could also 1418 1739 bmc * detect this by simply letting this loop run 1419 1739 bmc * to completion, as we would eventually hit 1420 1739 bmc * the end of the dirty list. However, we 1421 1739 bmc * want to avoid running the length of the 1422 1739 bmc * dirty list unnecessarily (it might be quite 1423 1739 bmc * long), so we catch this as early as 1424 1739 bmc * possible by detecting the hash marker. In 1425 1739 bmc * this case, we simply set dvar to NULL and 1426 1739 bmc * break; the conditional after the loop will 1427 1739 bmc * send us back to top. 
1428 1739 bmc */ 1429 1739 bmc dvar = NULL; 1430 1739 bmc break; 1431 0 stevel } 1432 0 stevel 1433 0 stevel goto next; 1434 0 stevel } 1435 0 stevel 1436 0 stevel if (dtuple->dtt_nkeys != nkeys) 1437 0 stevel goto next; 1438 0 stevel 1439 0 stevel for (i = 0; i < nkeys; i++, dkey++) { 1440 0 stevel if (dkey->dttk_size != key[i].dttk_size) 1441 0 stevel goto next; /* size or type mismatch */ 1442 0 stevel 1443 0 stevel if (dkey->dttk_size != 0) { 1444 0 stevel if (dtrace_bcmp( 1445 0 stevel (void *)(uintptr_t)key[i].dttk_value, 1446 0 stevel (void *)(uintptr_t)dkey->dttk_value, 1447 0 stevel dkey->dttk_size)) 1448 0 stevel goto next; 1449 0 stevel } else { 1450 0 stevel if (dkey->dttk_value != key[i].dttk_value) 1451 0 stevel goto next; 1452 0 stevel } 1453 0 stevel } 1454 0 stevel 1455 0 stevel if (op != DTRACE_DYNVAR_DEALLOC) 1456 0 stevel return (dvar); 1457 0 stevel 1458 0 stevel ASSERT(dvar->dtdv_next == NULL || 1459 1739 bmc dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE); 1460 0 stevel 1461 0 stevel if (prev != NULL) { 1462 0 stevel ASSERT(hash[bucket].dtdh_chain != dvar); 1463 0 stevel ASSERT(start != dvar); 1464 0 stevel ASSERT(prev->dtdv_next == dvar); 1465 0 stevel prev->dtdv_next = dvar->dtdv_next; 1466 0 stevel } else { 1467 0 stevel if (dtrace_casptr(&hash[bucket].dtdh_chain, 1468 0 stevel start, dvar->dtdv_next) != start) { 1469 0 stevel /* 1470 0 stevel * We have failed to atomically swing the 1471 0 stevel * hash table head pointer, presumably because 1472 0 stevel * of a conflicting allocation on another CPU. 1473 0 stevel * We need to reread the hash chain and try 1474 0 stevel * again. 1475 0 stevel */ 1476 0 stevel goto top; 1477 0 stevel } 1478 0 stevel } 1479 0 stevel 1480 0 stevel dtrace_membar_producer(); 1481 0 stevel 1482 0 stevel /* 1483 1739 bmc * Now set the hash value to indicate that it's free. 
1484 0 stevel */ 1485 0 stevel ASSERT(hash[bucket].dtdh_chain != dvar); 1486 1739 bmc dvar->dtdv_hashval = DTRACE_DYNHASH_FREE; 1487 0 stevel 1488 0 stevel dtrace_membar_producer(); 1489 0 stevel 1490 0 stevel /* 1491 0 stevel * Set the next pointer to point at the dirty list, and 1492 0 stevel * atomically swing the dirty pointer to the newly freed dvar. 1493 0 stevel */ 1494 0 stevel do { 1495 0 stevel next = dcpu->dtdsc_dirty; 1496 0 stevel dvar->dtdv_next = next; 1497 0 stevel } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next); 1498 0 stevel 1499 0 stevel /* 1500 0 stevel * Finally, unlock this hash bucket. 1501 0 stevel */ 1502 0 stevel ASSERT(hash[bucket].dtdh_lock == lock); 1503 0 stevel ASSERT(lock & 1); 1504 0 stevel hash[bucket].dtdh_lock++; 1505 0 stevel 1506 0 stevel return (NULL); 1507 0 stevel next: 1508 0 stevel prev = dvar; 1509 0 stevel continue; 1510 1739 bmc } 1511 1739 bmc 1512 1739 bmc if (dvar == NULL) { 1513 1739 bmc /* 1514 1739 bmc * If dvar is NULL, it is because we went off the rails: 1515 1739 bmc * one of the elements that we traversed in the hash chain 1516 1739 bmc * was deleted while we were traversing it. In this case, 1517 1739 bmc * we assert that we aren't doing a dealloc (deallocs lock 1518 1739 bmc * the hash bucket to prevent themselves from racing with 1519 1739 bmc * one another), and retry the hash chain traversal. 1520 1739 bmc */ 1521 1739 bmc ASSERT(op != DTRACE_DYNVAR_DEALLOC); 1522 1739 bmc goto top; 1523 0 stevel } 1524 0 stevel 1525 0 stevel if (op != DTRACE_DYNVAR_ALLOC) { 1526 0 stevel /* 1527 0 stevel * If we are not to allocate a new variable, we want to 1528 0 stevel * return NULL now. Before we return, check that the value 1529 0 stevel * of the lock word hasn't changed. If it has, we may have 1530 0 stevel * seen an inconsistent snapshot. 
1531 0 stevel */ 1532 0 stevel if (op == DTRACE_DYNVAR_NOALLOC) { 1533 0 stevel if (hash[bucket].dtdh_lock != lock) 1534 0 stevel goto top; 1535 0 stevel } else { 1536 0 stevel ASSERT(op == DTRACE_DYNVAR_DEALLOC); 1537 0 stevel ASSERT(hash[bucket].dtdh_lock == lock); 1538 0 stevel ASSERT(lock & 1); 1539 0 stevel hash[bucket].dtdh_lock++; 1540 0 stevel } 1541 0 stevel 1542 0 stevel return (NULL); 1543 0 stevel } 1544 0 stevel 1545 0 stevel /* 1546 0 stevel * We need to allocate a new dynamic variable. The size we need is the 1547 0 stevel * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the 1548 0 stevel * size of any auxiliary key data (rounded up to 8-byte alignment) plus 1549 0 stevel * the size of any referred-to data (dsize). We then round the final 1550 0 stevel * size up to the chunksize for allocation. 1551 0 stevel */ 1552 0 stevel for (ksize = 0, i = 0; i < nkeys; i++) 1553 0 stevel ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t)); 1554 0 stevel 1555 0 stevel /* 1556 0 stevel * This should be pretty much impossible, but could happen if, say, 1557 0 stevel * strange DIF specified the tuple. Ideally, this should be an 1558 0 stevel * assertion and not an error condition -- but that requires that the 1559 0 stevel * chunksize calculation in dtrace_difo_chunksize() be absolutely 1560 0 stevel * bullet-proof. (That is, it must not be able to be fooled by 1561 0 stevel * malicious DIF.) Given the lack of backwards branches in DIF, 1562 0 stevel * solving this would presumably not amount to solving the Halting 1563 0 stevel * Problem -- but it still seems awfully hard. 
1564 0 stevel */ 1565 0 stevel if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) + 1566 0 stevel ksize + dsize > chunksize) { 1567 0 stevel dcpu->dtdsc_drops++; 1568 0 stevel return (NULL); 1569 0 stevel } 1570 0 stevel 1571 0 stevel nstate = DTRACE_DSTATE_EMPTY; 1572 0 stevel 1573 0 stevel do { 1574 0 stevel retry: 1575 0 stevel free = dcpu->dtdsc_free; 1576 0 stevel 1577 0 stevel if (free == NULL) { 1578 0 stevel dtrace_dynvar_t *clean = dcpu->dtdsc_clean; 1579 0 stevel void *rval; 1580 0 stevel 1581 0 stevel if (clean == NULL) { 1582 0 stevel /* 1583 0 stevel * We're out of dynamic variable space on 1584 0 stevel * this CPU. Unless we have tried all CPUs, 1585 0 stevel * we'll try to allocate from a different 1586 0 stevel * CPU. 1587 0 stevel */ 1588 0 stevel switch (dstate->dtds_state) { 1589 0 stevel case DTRACE_DSTATE_CLEAN: { 1590 0 stevel void *sp = &dstate->dtds_state; 1591 0 stevel 1592 0 stevel if (++cpu >= NCPU) 1593 0 stevel cpu = 0; 1594 0 stevel 1595 0 stevel if (dcpu->dtdsc_dirty != NULL && 1596 0 stevel nstate == DTRACE_DSTATE_EMPTY) 1597 0 stevel nstate = DTRACE_DSTATE_DIRTY; 1598 0 stevel 1599 0 stevel if (dcpu->dtdsc_rinsing != NULL) 1600 0 stevel nstate = DTRACE_DSTATE_RINSING; 1601 0 stevel 1602 0 stevel dcpu = &dstate->dtds_percpu[cpu]; 1603 0 stevel 1604 0 stevel if (cpu != me) 1605 0 stevel goto retry; 1606 0 stevel 1607 0 stevel (void) dtrace_cas32(sp, 1608 0 stevel DTRACE_DSTATE_CLEAN, nstate); 1609 0 stevel 1610 0 stevel /* 1611 0 stevel * To increment the correct bean 1612 0 stevel * counter, take another lap. 
1613 0 stevel */ 1614 0 stevel goto retry; 1615 0 stevel } 1616 0 stevel 1617 0 stevel case DTRACE_DSTATE_DIRTY: 1618 0 stevel dcpu->dtdsc_dirty_drops++; 1619 0 stevel break; 1620 0 stevel 1621 0 stevel case DTRACE_DSTATE_RINSING: 1622 0 stevel dcpu->dtdsc_rinsing_drops++; 1623 0 stevel break; 1624 0 stevel 1625 0 stevel case DTRACE_DSTATE_EMPTY: 1626 0 stevel dcpu->dtdsc_drops++; 1627 0 stevel break; 1628 0 stevel } 1629 0 stevel 1630 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP); 1631 0 stevel return (NULL); 1632 0 stevel } 1633 0 stevel 1634 0 stevel /* 1635 0 stevel * The clean list appears to be non-empty. We want to 1636 0 stevel * move the clean list to the free list; we start by 1637 0 stevel * moving the clean pointer aside. 1638 0 stevel */ 1639 0 stevel if (dtrace_casptr(&dcpu->dtdsc_clean, 1640 0 stevel clean, NULL) != clean) { 1641 0 stevel /* 1642 0 stevel * We are in one of two situations: 1643 0 stevel * 1644 0 stevel * (a) The clean list was switched to the 1645 0 stevel * free list by another CPU. 1646 0 stevel * 1647 0 stevel * (b) The clean list was added to by the 1648 0 stevel * cleansing cyclic. 1649 0 stevel * 1650 0 stevel * In either of these situations, we can 1651 0 stevel * just reattempt the free list allocation. 1652 0 stevel */ 1653 0 stevel goto retry; 1654 0 stevel } 1655 0 stevel 1656 1739 bmc ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE); 1657 0 stevel 1658 0 stevel /* 1659 0 stevel * Now we'll move the clean list to the free list. 1660 0 stevel * It's impossible for this to fail: the only way 1661 0 stevel * the free list can be updated is through this 1662 0 stevel * code path, and only one CPU can own the clean list. 1663 0 stevel * Thus, it would only be possible for this to fail if 1664 0 stevel * this code were racing with dtrace_dynvar_clean(). 1665 0 stevel * (That is, if dtrace_dynvar_clean() updated the clean 1666 0 stevel * list, and we ended up racing to update the free 1667 0 stevel * list.) 
This race is prevented by the dtrace_sync() 1668 0 stevel * in dtrace_dynvar_clean() -- which flushes the 1669 0 stevel * owners of the clean lists out before resetting 1670 0 stevel * the clean lists. 1671 0 stevel */ 1672 0 stevel rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean); 1673 0 stevel ASSERT(rval == NULL); 1674 0 stevel goto retry; 1675 0 stevel } 1676 0 stevel 1677 0 stevel dvar = free; 1678 0 stevel new_free = dvar->dtdv_next; 1679 0 stevel } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free); 1680 0 stevel 1681 0 stevel /* 1682 0 stevel * We have now allocated a new chunk. We copy the tuple keys into the 1683 0 stevel * tuple array and copy any referenced key data into the data space 1684 0 stevel * following the tuple array. As we do this, we relocate dttk_value 1685 0 stevel * in the final tuple to point to the key data address in the chunk. 1686 0 stevel */ 1687 0 stevel kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys]; 1688 0 stevel dvar->dtdv_data = (void *)(kdata + ksize); 1689 0 stevel dvar->dtdv_tuple.dtt_nkeys = nkeys; 1690 0 stevel 1691 0 stevel for (i = 0; i < nkeys; i++) { 1692 0 stevel dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i]; 1693 0 stevel size_t kesize = key[i].dttk_size; 1694 0 stevel 1695 0 stevel if (kesize != 0) { 1696 0 stevel dtrace_bcopy( 1697 0 stevel (const void *)(uintptr_t)key[i].dttk_value, 1698 0 stevel (void *)kdata, kesize); 1699 0 stevel dkey->dttk_value = kdata; 1700 0 stevel kdata += P2ROUNDUP(kesize, sizeof (uint64_t)); 1701 0 stevel } else { 1702 0 stevel dkey->dttk_value = key[i].dttk_value; 1703 0 stevel } 1704 0 stevel 1705 0 stevel dkey->dttk_size = kesize; 1706 0 stevel } 1707 0 stevel 1708 1739 bmc ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE); 1709 0 stevel dvar->dtdv_hashval = hashval; 1710 0 stevel dvar->dtdv_next = start; 1711 0 stevel 1712 0 stevel if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start) 1713 0 stevel return (dvar); 1714 0 stevel 1715 0 stevel /* 
1716 0 stevel * The cas has failed. Either another CPU is adding an element to 1717 0 stevel * this hash chain, or another CPU is deleting an element from this 1718 0 stevel * hash chain. The simplest way to deal with both of these cases 1719 0 stevel * (though not necessarily the most efficient) is to free our 1720 0 stevel * allocated block and tail-call ourselves. Note that the free is 1721 0 stevel * to the dirty list and _not_ to the free list. This is to prevent 1722 0 stevel * races with allocators, above. 1723 0 stevel */ 1724 1739 bmc dvar->dtdv_hashval = DTRACE_DYNHASH_FREE; 1725 0 stevel 1726 0 stevel dtrace_membar_producer(); 1727 0 stevel 1728 0 stevel do { 1729 0 stevel free = dcpu->dtdsc_dirty; 1730 0 stevel dvar->dtdv_next = free; 1731 0 stevel } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free); 1732 0 stevel 1733 2870 dp return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate)); 1734 0 stevel } 1735 0 stevel 1736 457 bmc /*ARGSUSED*/ 1737 457 bmc static void 1738 457 bmc dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg) 1739 0 stevel { 1740 5984 jhaslam if ((int64_t)nval < (int64_t)*oval) 1741 0 stevel *oval = nval; 1742 0 stevel } 1743 0 stevel 1744 457 bmc /*ARGSUSED*/ 1745 457 bmc static void 1746 457 bmc dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg) 1747 0 stevel { 1748 5984 jhaslam if ((int64_t)nval > (int64_t)*oval) 1749 0 stevel *oval = nval; 1750 0 stevel } 1751 0 stevel 1752 0 stevel static void 1753 457 bmc dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr) 1754 0 stevel { 1755 0 stevel int i, zero = DTRACE_QUANTIZE_ZEROBUCKET; 1756 0 stevel int64_t val = (int64_t)nval; 1757 0 stevel 1758 0 stevel if (val < 0) { 1759 0 stevel for (i = 0; i < zero; i++) { 1760 0 stevel if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) { 1761 457 bmc quanta[i] += incr; 1762 0 stevel return; 1763 0 stevel } 1764 0 stevel } 1765 0 stevel } else { 1766 0 stevel for (i = zero + 1; i < 
DTRACE_QUANTIZE_NBUCKETS; i++) { 1767 0 stevel if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) { 1768 457 bmc quanta[i - 1] += incr; 1769 0 stevel return; 1770 0 stevel } 1771 0 stevel } 1772 0 stevel 1773 457 bmc quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr; 1774 0 stevel return; 1775 0 stevel } 1776 0 stevel 1777 0 stevel ASSERT(0); 1778 0 stevel } 1779 0 stevel 1780 0 stevel static void 1781 457 bmc dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr) 1782 0 stevel { 1783 0 stevel uint64_t arg = *lquanta++; 1784 0 stevel int32_t base = DTRACE_LQUANTIZE_BASE(arg); 1785 0 stevel uint16_t step = DTRACE_LQUANTIZE_STEP(arg); 1786 0 stevel uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg); 1787 0 stevel int32_t val = (int32_t)nval, level; 1788 0 stevel 1789 0 stevel ASSERT(step != 0); 1790 0 stevel ASSERT(levels != 0); 1791 0 stevel 1792 0 stevel if (val < base) { 1793 0 stevel /* 1794 0 stevel * This is an underflow. 1795 0 stevel */ 1796 457 bmc lquanta[0] += incr; 1797 0 stevel return; 1798 0 stevel } 1799 0 stevel 1800 0 stevel level = (val - base) / step; 1801 0 stevel 1802 0 stevel if (level < levels) { 1803 457 bmc lquanta[level + 1] += incr; 1804 0 stevel return; 1805 0 stevel } 1806 0 stevel 1807 0 stevel /* 1808 0 stevel * This is an overflow. 
1809 0 stevel */ 1810 457 bmc lquanta[levels + 1] += incr; 1811 457 bmc } 1812 457 bmc 1813 457 bmc /*ARGSUSED*/ 1814 457 bmc static void 1815 457 bmc dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg) 1816 0 stevel { 1817 0 stevel data[0]++; 1818 0 stevel data[1] += nval; 1819 5984 jhaslam } 1820 5984 jhaslam 1821 5984 jhaslam /*ARGSUSED*/ 1822 5984 jhaslam static void 1823 5984 jhaslam dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg) 1824 5984 jhaslam { 1825 5984 jhaslam int64_t snval = (int64_t)nval; 1826 5984 jhaslam uint64_t tmp[2]; 1827 5984 jhaslam 1828 5984 jhaslam data[0]++; 1829 5984 jhaslam data[1] += nval; 1830 5984 jhaslam 1831 5984 jhaslam /* 1832 5984 jhaslam * What we want to say here is: 1833 5984 jhaslam * 1834 5984 jhaslam * data[2] += nval * nval; 1835 5984 jhaslam * 1836 5984 jhaslam * But given that nval is 64-bit, we could easily overflow, so 1837 5984 jhaslam * we do this as 128-bit arithmetic. 1838 5984 jhaslam */ 1839 5984 jhaslam if (snval < 0) 1840 5984 jhaslam snval = -snval; 1841 5984 jhaslam 1842 5984 jhaslam dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp); 1843 5984 jhaslam dtrace_add_128(data + 2, tmp, data + 2); 1844 0 stevel } 1845 0 stevel 1846 0 stevel /*ARGSUSED*/ 1847 0 stevel static void 1848 457 bmc dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg) 1849 0 stevel { 1850 0 stevel *oval = *oval + 1; 1851 0 stevel } 1852 0 stevel 1853 0 stevel /*ARGSUSED*/ 1854 0 stevel static void 1855 457 bmc dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg) 1856 0 stevel { 1857 0 stevel *oval += nval; 1858 0 stevel } 1859 0 stevel 1860 0 stevel /* 1861 0 stevel * Aggregate given the tuple in the principal data buffer, and the aggregating 1862 0 stevel * action denoted by the specified dtrace_aggregation_t. The aggregation 1863 0 stevel * buffer is specified as the buf parameter. 
This routine does not return 1864 0 stevel * failure; if there is no space in the aggregation buffer, the data will be 1865 0 stevel * dropped, and a corresponding counter incremented. 1866 0 stevel */ 1867 0 stevel static void 1868 0 stevel dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf, 1869 457 bmc intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg) 1870 0 stevel { 1871 0 stevel dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec; 1872 0 stevel uint32_t i, ndx, size, fsize; 1873 0 stevel uint32_t align = sizeof (uint64_t) - 1; 1874 0 stevel dtrace_aggbuffer_t *agb; 1875 0 stevel dtrace_aggkey_t *key; 1876 1017 bmc uint32_t hashval = 0, limit, isstr; 1877 0 stevel caddr_t tomax, data, kdata; 1878 0 stevel dtrace_actkind_t action; 1879 1017 bmc dtrace_action_t *act; 1880 0 stevel uintptr_t offs; 1881 0 stevel 1882 0 stevel if (buf == NULL) 1883 0 stevel return; 1884 0 stevel 1885 457 bmc if (!agg->dtag_hasarg) { 1886 457 bmc /* 1887 457 bmc * Currently, only quantize() and lquantize() take additional 1888 457 bmc * arguments, and they have the same semantics: an increment 1889 457 bmc * value that defaults to 1 when not present. If additional 1890 457 bmc * aggregating actions take arguments, the setting of the 1891 457 bmc * default argument value will presumably have to become more 1892 457 bmc * sophisticated... 1893 457 bmc */ 1894 457 bmc arg = 1; 1895 457 bmc } 1896 457 bmc 1897 0 stevel action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION; 1898 0 stevel size = rec->dtrd_offset - agg->dtag_base; 1899 0 stevel fsize = size + rec->dtrd_size; 1900 0 stevel 1901 0 stevel ASSERT(dbuf->dtb_tomax != NULL); 1902 0 stevel data = dbuf->dtb_tomax + offset + agg->dtag_base; 1903 0 stevel 1904 0 stevel if ((tomax = buf->dtb_tomax) == NULL) { 1905 0 stevel dtrace_buffer_drop(buf); 1906 0 stevel return; 1907 0 stevel } 1908 0 stevel 1909 0 stevel /* 1910 0 stevel * The metastructure is always at the bottom of the buffer. 
1911 0 stevel */ 1912 0 stevel agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size - 1913 0 stevel sizeof (dtrace_aggbuffer_t)); 1914 0 stevel 1915 0 stevel if (buf->dtb_offset == 0) { 1916 0 stevel /* 1917 0 stevel * We just kludge up approximately 1/8th of the size to be 1918 0 stevel * buckets. If this guess ends up being routinely 1919 0 stevel * off-the-mark, we may need to dynamically readjust this 1920 0 stevel * based on past performance. 1921 0 stevel */ 1922 0 stevel uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t); 1923 0 stevel 1924 0 stevel if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) < 1925 0 stevel (uintptr_t)tomax || hashsize == 0) { 1926 0 stevel /* 1927 0 stevel * We've been given a ludicrously small buffer; 1928 0 stevel * increment our drop count and leave. 1929 0 stevel */ 1930 0 stevel dtrace_buffer_drop(buf); 1931 0 stevel return; 1932 0 stevel } 1933 0 stevel 1934 0 stevel /* 1935 0 stevel * And now, a pathetic attempt to try to get a an odd (or 1936 0 stevel * perchance, a prime) hash size for better hash distribution. 1937 0 stevel */ 1938 0 stevel if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3)) 1939 0 stevel hashsize -= DTRACE_AGGHASHSIZE_SLEW; 1940 0 stevel 1941 0 stevel agb->dtagb_hashsize = hashsize; 1942 0 stevel agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb - 1943 0 stevel agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *)); 1944 0 stevel agb->dtagb_free = (uintptr_t)agb->dtagb_hash; 1945 0 stevel 1946 0 stevel for (i = 0; i < agb->dtagb_hashsize; i++) 1947 0 stevel agb->dtagb_hash[i] = NULL; 1948 0 stevel } 1949 1017 bmc 1950 1017 bmc ASSERT(agg->dtag_first != NULL); 1951 1017 bmc ASSERT(agg->dtag_first->dta_intuple); 1952 0 stevel 1953 0 stevel /* 1954 0 stevel * Calculate the hash value based on the key. Note that we _don't_ 1955 0 stevel * include the aggid in the hashing (but we will store it as part of 1956 0 stevel * the key). 
The hashing algorithm is Bob Jenkins' "One-at-a-time" 1957 0 stevel * algorithm: a simple, quick algorithm that has no known funnels, and 1958 0 stevel * gets good distribution in practice. The efficacy of the hashing 1959 0 stevel * algorithm (and a comparison with other algorithms) may be found by 1960 0 stevel * running the ::dtrace_aggstat MDB dcmd. 1961 0 stevel */ 1962 1017 bmc for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) { 1963 1017 bmc i = act->dta_rec.dtrd_offset - agg->dtag_base; 1964 1017 bmc limit = i + act->dta_rec.dtrd_size; 1965 1017 bmc ASSERT(limit <= size); 1966 1017 bmc isstr = DTRACEACT_ISSTRING(act); 1967 1017 bmc 1968 1017 bmc for (; i < limit; i++) { 1969 1017 bmc hashval += data[i]; 1970 1017 bmc hashval += (hashval << 10); 1971 1017 bmc hashval ^= (hashval >> 6); 1972 1017 bmc 1973 1017 bmc if (isstr && data[i] == '\0') 1974 1017 bmc break; 1975 1017 bmc } 1976 0 stevel } 1977 0 stevel 1978 0 stevel hashval += (hashval << 3); 1979 0 stevel hashval ^= (hashval >> 11); 1980 0 stevel hashval += (hashval << 15); 1981 0 stevel 1982 0 stevel /* 1983 1017 bmc * Yes, the divide here is expensive -- but it's generally the least 1984 1017 bmc * of the performance issues given the amount of data that we iterate 1985 1017 bmc * over to compute hash values, compare data, etc. 
1986 0 stevel */ 1987 0 stevel ndx = hashval % agb->dtagb_hashsize; 1988 0 stevel 1989 0 stevel for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) { 1990 0 stevel ASSERT((caddr_t)key >= tomax); 1991 0 stevel ASSERT((caddr_t)key < tomax + buf->dtb_size); 1992 0 stevel 1993 0 stevel if (hashval != key->dtak_hashval || key->dtak_size != size) 1994 0 stevel continue; 1995 0 stevel 1996 0 stevel kdata = key->dtak_data; 1997 0 stevel ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size); 1998 0 stevel 1999 1017 bmc for (act = agg->dtag_first; act->dta_intuple; 2000 1017 bmc act = act->dta_next) { 2001 1017 bmc i = act->dta_rec.dtrd_offset - agg->dtag_base; 2002 1017 bmc limit = i + act->dta_rec.dtrd_size; 2003 1017 bmc ASSERT(limit <= size); 2004 1017 bmc isstr = DTRACEACT_ISSTRING(act); 2005 1017 bmc 2006 1017 bmc for (; i < limit; i++) { 2007 1017 bmc if (kdata[i] != data[i]) 2008 1017 bmc goto next; 2009 1017 bmc 2010 1017 bmc if (isstr && data[i] == '\0') 2011 1017 bmc break; 2012 1017 bmc } 2013 0 stevel } 2014 0 stevel 2015 0 stevel if (action != key->dtak_action) { 2016 0 stevel /* 2017 0 stevel * We are aggregating on the same value in the same 2018 0 stevel * aggregation with two different aggregating actions. 2019 0 stevel * (This should have been picked up in the compiler, 2020 0 stevel * so we may be dealing with errant or devious DIF.) 2021 0 stevel * This is an error condition; we indicate as much, 2022 0 stevel * and return. 2023 0 stevel */ 2024 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); 2025 0 stevel return; 2026 0 stevel } 2027 0 stevel 2028 0 stevel /* 2029 0 stevel * This is a hit: we need to apply the aggregator to 2030 0 stevel * the value at this key. 2031 0 stevel */ 2032 457 bmc agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg); 2033 0 stevel return; 2034 0 stevel next: 2035 0 stevel continue; 2036 0 stevel } 2037 0 stevel 2038 0 stevel /* 2039 0 stevel * We didn't find it. 
We need to allocate some zero-filled space, 2040 0 stevel * link it into the hash table appropriately, and apply the aggregator 2041 0 stevel * to the (zero-filled) value. 2042 0 stevel */ 2043 0 stevel offs = buf->dtb_offset; 2044 0 stevel while (offs & (align - 1)) 2045 0 stevel offs += sizeof (uint32_t); 2046 0 stevel 2047 0 stevel /* 2048 0 stevel * If we don't have enough room to both allocate a new key _and_ 2049 0 stevel * its associated data, increment the drop count and return. 2050 0 stevel */ 2051 0 stevel if ((uintptr_t)tomax + offs + fsize > 2052 0 stevel agb->dtagb_free - sizeof (dtrace_aggkey_t)) { 2053 0 stevel dtrace_buffer_drop(buf); 2054 0 stevel return; 2055 0 stevel } 2056 0 stevel 2057 0 stevel /*CONSTCOND*/ 2058 0 stevel ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1))); 2059 0 stevel key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t)); 2060 0 stevel agb->dtagb_free -= sizeof (dtrace_aggkey_t); 2061 0 stevel 2062 0 stevel key->dtak_data = kdata = tomax + offs; 2063 0 stevel buf->dtb_offset = offs + fsize; 2064 0 stevel 2065 0 stevel /* 2066 0 stevel * Now copy the data across. 2067 0 stevel */ 2068 0 stevel *((dtrace_aggid_t *)kdata) = agg->dtag_id; 2069 0 stevel 2070 0 stevel for (i = sizeof (dtrace_aggid_t); i < size; i++) 2071 0 stevel kdata[i] = data[i]; 2072 1017 bmc 2073 1017 bmc /* 2074 1017 bmc * Because strings are not zeroed out by default, we need to iterate 2075 1017 bmc * looking for actions that store strings, and we need to explicitly 2076 1017 bmc * pad these strings out with zeroes. 
2077 1017 bmc */ 2078 1017 bmc for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) { 2079 1017 bmc int nul; 2080 1017 bmc 2081 1017 bmc if (!DTRACEACT_ISSTRING(act)) 2082 1017 bmc continue; 2083 1017 bmc 2084 1017 bmc i = act->dta_rec.dtrd_offset - agg->dtag_base; 2085 1017 bmc limit = i + act->dta_rec.dtrd_size; 2086 1017 bmc ASSERT(limit <= size); 2087 1017 bmc 2088 1017 bmc for (nul = 0; i < limit; i++) { 2089 1017 bmc if (nul) { 2090 1017 bmc kdata[i] = '\0'; 2091 1017 bmc continue; 2092 1017 bmc } 2093 1017 bmc 2094 1017 bmc if (data[i] != '\0') 2095 1017 bmc continue; 2096 1017 bmc 2097 1017 bmc nul = 1; 2098 1017 bmc } 2099 1017 bmc } 2100 0 stevel 2101 0 stevel for (i = size; i < fsize; i++) 2102 0 stevel kdata[i] = 0; 2103 0 stevel 2104 0 stevel key->dtak_hashval = hashval; 2105 0 stevel key->dtak_size = size; 2106 0 stevel key->dtak_action = action; 2107 0 stevel key->dtak_next = agb->dtagb_hash[ndx]; 2108 0 stevel agb->dtagb_hash[ndx] = key; 2109 0 stevel 2110 0 stevel /* 2111 0 stevel * Finally, apply the aggregator. 2112 0 stevel */ 2113 0 stevel *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial; 2114 457 bmc agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg); 2115 0 stevel } 2116 0 stevel 2117 0 stevel /* 2118 0 stevel * Given consumer state, this routine finds a speculation in the INACTIVE 2119 0 stevel * state and transitions it into the ACTIVE state. If there is no speculation 2120 0 stevel * in the INACTIVE state, 0 is returned. In this case, no error counter is 2121 0 stevel * incremented -- it is up to the caller to take appropriate action. 
2122 0 stevel */ 2123 0 stevel static int 2124 0 stevel dtrace_speculation(dtrace_state_t *state) 2125 0 stevel { 2126 0 stevel int i = 0; 2127 0 stevel dtrace_speculation_state_t current; 2128 0 stevel uint32_t *stat = &state->dts_speculations_unavail, count; 2129 0 stevel 2130 0 stevel while (i < state->dts_nspeculations) { 2131 0 stevel dtrace_speculation_t *spec = &state->dts_speculations[i]; 2132 0 stevel 2133 0 stevel current = spec->dtsp_state; 2134 0 stevel 2135 0 stevel if (current != DTRACESPEC_INACTIVE) { 2136 0 stevel if (current == DTRACESPEC_COMMITTINGMANY || 2137 0 stevel current == DTRACESPEC_COMMITTING || 2138 0 stevel current == DTRACESPEC_DISCARDING) 2139 0 stevel stat = &state->dts_speculations_busy; 2140 0 stevel i++; 2141 0 stevel continue; 2142 0 stevel } 2143 0 stevel 2144 0 stevel if (dtrace_cas32((uint32_t *)&spec->dtsp_state, 2145 0 stevel current, DTRACESPEC_ACTIVE) == current) 2146 0 stevel return (i + 1); 2147 0 stevel } 2148 0 stevel 2149 0 stevel /* 2150 0 stevel * We couldn't find a speculation. If we found as much as a single 2151 0 stevel * busy speculation buffer, we'll attribute this failure as "busy" 2152 0 stevel * instead of "unavail". 2153 0 stevel */ 2154 0 stevel do { 2155 0 stevel count = *stat; 2156 0 stevel } while (dtrace_cas32(stat, count, count + 1) != count); 2157 0 stevel 2158 0 stevel return (0); 2159 0 stevel } 2160 0 stevel 2161 0 stevel /* 2162 0 stevel * This routine commits an active speculation. If the specified speculation 2163 0 stevel * is not in a valid state to perform a commit(), this routine will silently do 2164 0 stevel * nothing. 
The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>
 */
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_buffer_t *src, *dest;
	dtrace_speculation_state_t current, new;
	intptr_t offs;

	/*
	 * An ID of zero is silently ignored; an ID beyond the number of
	 * speculations is flagged as an illegal operation on this CPU.
	 */
	if (which == 0)
		return;

	if (which > state->dts_nspeculations) {
		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return;
	}

	spec = &state->dts_speculations[which - 1];
	src = &spec->dtsp_buffer[cpu];
	dest = &state->dts_buffer[cpu];

	/*
	 * Atomically move the speculation into the appropriate committing
	 * state, retrying for as long as the state changes underneath us.
	 */
	for (;;) {
		current = spec->dtsp_state;

		/*
		 * Already COMMITTINGMANY:  no transition is needed, and we
		 * go straight to committing this CPU's buffer.
		 */
		if (current == DTRACESPEC_COMMITTINGMANY)
			break;

		if (current == DTRACESPEC_INACTIVE ||
		    current == DTRACESPEC_DISCARDING)
			return;

		if (current == DTRACESPEC_COMMITTING) {
			/*
			 * This is only possible if we are (a) commit()'ing
			 * without having done a prior speculate() on this CPU
			 * and (b) racing with another commit() on a different
			 * CPU.  There's nothing to do -- we just assert that
			 * our offset is 0.
			 */
			ASSERT(src->dtb_offset == 0);
			return;
		}

		if (current == DTRACESPEC_ACTIVE ||
		    (current == DTRACESPEC_ACTIVEONE &&
		    src->dtb_offset != 0)) {
			/*
			 * For ACTIVEONE, a non-zero buffer offset tells us
			 * that the one active CPU must be us.
			 */
			new = DTRACESPEC_COMMITTING;
		} else if (current == DTRACESPEC_ACTIVEONE ||
		    current == DTRACESPEC_ACTIVEMANY) {
			/*
			 * Either the speculation is active on many CPUs, or
			 * we are committing on a different CPU from the
			 * speculate(); in both cases we must rely on being
			 * asynchronously cleaned.
			 */
			new = DTRACESPEC_COMMITTINGMANY;
		} else {
			ASSERT(0);
		}

		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
		    current, new) == current)
			break;
	}

	/*
	 * We have set the state to indicate that we are committing this
	 * speculation.  Now reserve the necessary space in the destination
	 * buffer.
	 */
	offs = dtrace_buffer_reserve(dest, src->dtb_offset,
	    sizeof (uint64_t), state, NULL);

	if (offs < 0) {
		dtrace_buffer_drop(dest);
	} else {
		/*
		 * We have the space; copy the buffer across.  (Note that this
		 * is a highly suboptimal bcopy(); in the unlikely event that
		 * this becomes a serious performance issue, a high-performance
		 * DTrace-specific bcopy() should obviously be invented.)
		 */
		uintptr_t daddr = (uintptr_t)dest->dtb_tomax + offs;
		uintptr_t dlimit = daddr + src->dtb_offset;
		uintptr_t saddr = (uintptr_t)src->dtb_tomax;

		/*
		 * First, as many whole 64-bit words as will fit...
		 */
		for (; dlimit - daddr >= sizeof (uint64_t);
		    daddr += sizeof (uint64_t), saddr += sizeof (uint64_t))
			*((uint64_t *)daddr) = *((uint64_t *)saddr);

		/*
		 * ...and then any left-over bytes.
		 */
		for (; daddr != dlimit; daddr++, saddr++)
			*((uint8_t *)daddr) = *((uint8_t *)saddr);

		/*
		 * Finally, commit the reserved space in the destination
		 * buffer.
		 */
		dest->dtb_offset = offs + src->dtb_offset;
	}

	/*
	 * If we're lucky enough to be the only active CPU on this speculation
	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
	 */
	if (current == DTRACESPEC_ACTIVE ||
	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);

		ASSERT(rval == DTRACESPEC_COMMITTING);
	}

	/*
	 * Reset the speculative buffer, folding its drop count into
	 * dtb_xamot_drops.
	 */
	src->dtb_offset = 0;
	src->dtb_xamot_drops += src->dtb_drops;
	src->dtb_drops = 0;
}

/*
 * This routine discards an active speculation.  If the specified speculation
 * is not in a valid state to perform a discard(), this routine will silently
 * do nothing.
The state of the specified speculation is transitioned
 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
 */
static void
dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_speculation_state_t current, new;
	dtrace_buffer_t *buf;

	/*
	 * An ID of zero is silently ignored; an ID beyond the number of
	 * speculations is flagged as an illegal operation on this CPU.
	 */
	if (which == 0)
		return;

	if (which > state->dts_nspeculations) {
		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return;
	}

	spec = &state->dts_speculations[which - 1];
	buf = &spec->dtsp_buffer[cpu];

	/*
	 * Atomically transition the speculation, retrying for as long as
	 * the state changes underneath us.
	 */
	for (;;) {
		current = spec->dtsp_state;

		/*
		 * Nothing to discard if the speculation is not active.
		 */
		if (current == DTRACESPEC_INACTIVE ||
		    current == DTRACESPEC_COMMITTINGMANY ||
		    current == DTRACESPEC_COMMITTING ||
		    current == DTRACESPEC_DISCARDING)
			return;

		if (current == DTRACESPEC_ACTIVE ||
		    current == DTRACESPEC_ACTIVEMANY) {
			new = DTRACESPEC_DISCARDING;
		} else if (current == DTRACESPEC_ACTIVEONE) {
			/*
			 * When this CPU's buffer has data in it, the
			 * speculation goes directly back to INACTIVE;
			 * otherwise it is marked as DISCARDING.
			 */
			new = (buf->dtb_offset != 0) ?
			    DTRACESPEC_INACTIVE : DTRACESPEC_DISCARDING;
		} else {
			ASSERT(0);
		}

		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
		    current, new) == current)
			break;
	}

	/*
	 * Throw away this CPU's speculative data and its drop count.
	 */
	buf->dtb_offset = 0;
	buf->dtb_drops = 0;
}

/*
 * Note:  not called from probe context.
This function is called
 * asynchronously from cross call context to clean any speculations that are
 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
 * transitioned back to the INACTIVE state until all CPUs have cleaned the
 * speculation.
 */
static void
dtrace_speculation_clean_here(dtrace_state_t *state)
{
	dtrace_icookie_t cookie;
	processorid_t cpu = CPU->cpu_id;
	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
	dtrace_specid_t ndx;

	cookie = dtrace_interrupt_disable();

	/*
	 * There is only work to do if this CPU has a principal buffer.
	 */
	if (dest->dtb_tomax != NULL) {
		for (ndx = 0; ndx < state->dts_nspeculations; ndx++) {
			dtrace_speculation_t *spec =
			    &state->dts_speculations[ndx];
			dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];

			if (src->dtb_tomax == NULL)
				continue;

			if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
				/*
				 * Discarding just resets this CPU's
				 * speculative buffer.
				 */
				src->dtb_offset = 0;
			} else if (spec->dtsp_state ==
			    DTRACESPEC_COMMITTINGMANY &&
			    src->dtb_offset != 0) {
				/*
				 * This CPU has speculative data that is
				 * waiting to be committed; speculation IDs
				 * are one-based.
				 */
				dtrace_speculation_commit(state, cpu, ndx + 1);
			}
		}
	}

	dtrace_interrupt_enable(cookie);
}

/*
 * Note:  not called from probe context.  This function is called
 * asynchronously (and at a regular interval) to clean any speculations that
 * are in the COMMITTINGMANY or DISCARDING states.
If it discovers that there 2407 0 stevel * is work to be done, it cross calls all CPUs to perform that work; 2408 0 stevel * COMMITMANY and DISCARDING speculations may not be transitioned back to the 2409 0 stevel * INACTIVE state until they have been cleaned by all CPUs. 2410 0 stevel */ 2411 0 stevel static void 2412 0 stevel dtrace_speculation_clean(dtrace_state_t *state) 2413 0 stevel { 2414 0 stevel int work = 0, rv; 2415 0 stevel dtrace_specid_t i; 2416 0 stevel 2417 0 stevel for (i = 0; i < state->dts_nspeculations; i++) { 2418 0 stevel dtrace_speculation_t *spec = &state->dts_speculations[i]; 2419 0 stevel 2420 0 stevel ASSERT(!spec->dtsp_cleaning); 2421 0 stevel 2422 0 stevel if (spec->dtsp_state != DTRACESPEC_DISCARDING && 2423 0 stevel spec->dtsp_state != DTRACESPEC_COMMITTINGMANY) 2424 0 stevel continue; 2425 0 stevel 2426 0 stevel work++; 2427 0 stevel spec->dtsp_cleaning = 1; 2428 0 stevel } 2429 0 stevel 2430 0 stevel if (!work) 2431 0 stevel return; 2432 0 stevel 2433 0 stevel dtrace_xcall(DTRACE_CPUALL, 2434 0 stevel (dtrace_xcall_t)dtrace_speculation_clean_here, state); 2435 0 stevel 2436 0 stevel /* 2437 0 stevel * We now know that all CPUs have committed or discarded their 2438 0 stevel * speculation buffers, as appropriate. We can now set the state 2439 0 stevel * to inactive. 
2440 0 stevel */ 2441 0 stevel for (i = 0; i < state->dts_nspeculations; i++) { 2442 0 stevel dtrace_speculation_t *spec = &state->dts_speculations[i]; 2443 0 stevel dtrace_speculation_state_t current, new; 2444 0 stevel 2445 0 stevel if (!spec->dtsp_cleaning) 2446 0 stevel continue; 2447 0 stevel 2448 0 stevel current = spec->dtsp_state; 2449 0 stevel ASSERT(current == DTRACESPEC_DISCARDING || 2450 0 stevel current == DTRACESPEC_COMMITTINGMANY); 2451 0 stevel 2452 0 stevel new = DTRACESPEC_INACTIVE; 2453 0 stevel 2454 0 stevel rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new); 2455 0 stevel ASSERT(rv == current); 2456 0 stevel spec->dtsp_cleaning = 0; 2457 0 stevel } 2458 0 stevel } 2459 0 stevel 2460 0 stevel /* 2461 0 stevel * Called as part of a speculate() to get the speculative buffer associated 2462 0 stevel * with a given speculation. Returns NULL if the specified speculation is not 2463 0 stevel * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and 2464 0 stevel * the active CPU is not the specified CPU -- the speculation will be 2465 0 stevel * atomically transitioned into the ACTIVEMANY state. 
2466 0 stevel */ 2467 0 stevel static dtrace_buffer_t * 2468 0 stevel dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid, 2469 0 stevel dtrace_specid_t which) 2470 0 stevel { 2471 0 stevel dtrace_speculation_t *spec; 2472 0 stevel dtrace_speculation_state_t current, new; 2473 0 stevel dtrace_buffer_t *buf; 2474 0 stevel 2475 0 stevel if (which == 0) 2476 0 stevel return (NULL); 2477 0 stevel 2478 0 stevel if (which > state->dts_nspeculations) { 2479 0 stevel cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; 2480 0 stevel return (NULL); 2481 0 stevel } 2482 0 stevel 2483 0 stevel spec = &state->dts_speculations[which - 1]; 2484 0 stevel buf = &spec->dtsp_buffer[cpuid]; 2485 0 stevel 2486 0 stevel do { 2487 0 stevel current = spec->dtsp_state; 2488 0 stevel 2489 0 stevel switch (current) { 2490 0 stevel case DTRACESPEC_INACTIVE: 2491 0 stevel case DTRACESPEC_COMMITTINGMANY: 2492 0 stevel case DTRACESPEC_DISCARDING: 2493 0 stevel return (NULL); 2494 0 stevel 2495 0 stevel case DTRACESPEC_COMMITTING: 2496 0 stevel ASSERT(buf->dtb_offset == 0); 2497 0 stevel return (NULL); 2498 0 stevel 2499 0 stevel case DTRACESPEC_ACTIVEONE: 2500 0 stevel /* 2501 0 stevel * This speculation is currently active on one CPU. 2502 0 stevel * Check the offset in the buffer; if it's non-zero, 2503 0 stevel * that CPU must be us (and we leave the state alone). 2504 0 stevel * If it's zero, assume that we're starting on a new 2505 0 stevel * CPU -- and change the state to indicate that the 2506 0 stevel * speculation is active on more than one CPU. 
2507 0 stevel */ 2508 0 stevel if (buf->dtb_offset != 0) 2509 0 stevel return (buf); 2510 0 stevel 2511 0 stevel new = DTRACESPEC_ACTIVEMANY; 2512 0 stevel break; 2513 0 stevel 2514 0 stevel case DTRACESPEC_ACTIVEMANY: 2515 0 stevel return (buf); 2516 0 stevel 2517 0 stevel case DTRACESPEC_ACTIVE: 2518 0 stevel new = DTRACESPEC_ACTIVEONE; 2519 0 stevel break; 2520 0 stevel 2521 0 stevel default: 2522 0 stevel ASSERT(0); 2523 0 stevel } 2524 0 stevel } while (dtrace_cas32((uint32_t *)&spec->dtsp_state, 2525 0 stevel current, new) != current); 2526 0 stevel 2527 0 stevel ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY); 2528 0 stevel return (buf); 2529 0 stevel } 2530 0 stevel 2531 0 stevel /* 2532 2870 dp * Return a string. In the event that the user lacks the privilege to access 2533 2870 dp * arbitrary kernel memory, we copy the string out to scratch memory so that we 2534 2870 dp * don't fail access checking. 2535 2870 dp * 2536 2870 dp * dtrace_dif_variable() uses this routine as a helper for various 2537 2870 dp * builtin values such as 'execname' and 'probefunc.' 2538 2870 dp */ 2539 2870 dp uintptr_t 2540 2870 dp dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state, 2541 2870 dp dtrace_mstate_t *mstate) 2542 2870 dp { 2543 2870 dp uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; 2544 2870 dp uintptr_t ret; 2545 2870 dp size_t strsz; 2546 2870 dp 2547 2870 dp /* 2548 2870 dp * The easy case: this probe is allowed to read all of memory, so 2549 2870 dp * we can just return this as a vanilla pointer. 2550 2870 dp */ 2551 2870 dp if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) 2552 2870 dp return (addr); 2553 2870 dp 2554 2870 dp /* 2555 2870 dp * This is the tougher case: we copy the string in question from 2556 2870 dp * kernel memory into scratch memory and return it that way: this 2557 2870 dp * ensures that we won't trip up when access checking tests the 2558 2870 dp * BYREF return value. 
2559 2870 dp */ 2560 2870 dp strsz = dtrace_strlen((char *)addr, size) + 1; 2561 2870 dp 2562 2870 dp if (mstate->dtms_scratch_ptr + strsz > 2563 2870 dp mstate->dtms_scratch_base + mstate->dtms_scratch_size) { 2564 2870 dp DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); 2565 2870 dp return (NULL); 2566 2870 dp } 2567 2870 dp 2568 2870 dp dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr, 2569 2870 dp strsz); 2570 2870 dp ret = mstate->dtms_scratch_ptr; 2571 2870 dp mstate->dtms_scratch_ptr += strsz; 2572 2870 dp return (ret); 2573 2870 dp } 2574 2870 dp 2575 2870 dp /* 2576 0 stevel * This function implements the DIF emulator's variable lookups. The emulator 2577 0 stevel * passes a reserved variable identifier and optional built-in array index. 2578 0 stevel */ 2579 0 stevel static uint64_t 2580 0 stevel dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, 2581 457 bmc uint64_t ndx) 2582 0 stevel { 2583 0 stevel /* 2584 0 stevel * If we're accessing one of the uncached arguments, we'll turn this 2585 0 stevel * into a reference in the args array. 
2586 0 stevel */ 2587 0 stevel if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) { 2588 457 bmc ndx = v - DIF_VAR_ARG0; 2589 0 stevel v = DIF_VAR_ARGS; 2590 0 stevel } 2591 0 stevel 2592 0 stevel switch (v) { 2593 0 stevel case DIF_VAR_ARGS: 2594 0 stevel ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS); 2595 457 bmc if (ndx >= sizeof (mstate->dtms_arg) / 2596 0 stevel sizeof (mstate->dtms_arg[0])) { 2597 0 stevel int aframes = mstate->dtms_probe->dtpr_aframes + 2; 2598 0 stevel dtrace_provider_t *pv; 2599 0 stevel uint64_t val; 2600 0 stevel 2601 0 stevel pv = mstate->dtms_probe->dtpr_provider; 2602 0 stevel if (pv->dtpv_pops.dtps_getargval != NULL) 2603 0 stevel val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg, 2604 0 stevel mstate->dtms_probe->dtpr_id, 2605 457 bmc mstate->dtms_probe->dtpr_arg, ndx, aframes); 2606 0 stevel else 2607 457 bmc val = dtrace_getarg(ndx, aframes); 2608 0 stevel 2609 0 stevel /* 2610 0 stevel * This is regrettably required to keep the compiler 2611 0 stevel * from tail-optimizing the call to dtrace_getarg(). 2612 0 stevel * The condition always evaluates to true, but the 2613 0 stevel * compiler has no way of figuring that out a priori. 2614 0 stevel * (None of this would be necessary if the compiler 2615 0 stevel * could be relied upon to _always_ tail-optimize 2616 0 stevel * the call to dtrace_getarg() -- but it can't.) 
2617 0 stevel */ 2618 0 stevel if (mstate->dtms_probe != NULL) 2619 0 stevel return (val); 2620 0 stevel 2621 0 stevel ASSERT(0); 2622 0 stevel } 2623 0 stevel 2624 457 bmc return (mstate->dtms_arg[ndx]); 2625 0 stevel 2626 0 stevel case DIF_VAR_UREGS: { 2627 0 stevel klwp_t *lwp; 2628 0 stevel 2629 0 stevel if (!dtrace_priv_proc(state)) 2630 0 stevel return (0); 2631 0 stevel 2632 0 stevel if ((lwp = curthread->t_lwp) == NULL) { 2633 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); 2634 0 stevel cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL; 2635 0 stevel return (0); 2636 0 stevel } 2637 0 stevel 2638 457 bmc return (dtrace_getreg(lwp->lwp_regs, ndx)); 2639 0 stevel } 2640 0 stevel 2641 0 stevel case DIF_VAR_CURTHREAD: 2642 0 stevel if (!dtrace_priv_kernel(state)) 2643 0 stevel return (0); 2644 0 stevel return ((uint64_t)(uintptr_t)curthread); 2645 0 stevel 2646 0 stevel case DIF_VAR_TIMESTAMP: 2647 0 stevel if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) { 2648 0 stevel mstate->dtms_timestamp = dtrace_gethrtime(); 2649 0 stevel mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP; 2650 0 stevel } 2651 0 stevel return (mstate->dtms_timestamp); 2652 0 stevel 2653 0 stevel case DIF_VAR_VTIMESTAMP: 2654 0 stevel ASSERT(dtrace_vtime_references != 0); 2655 0 stevel return (curthread->t_dtrace_vtime); 2656 0 stevel 2657 0 stevel case DIF_VAR_WALLTIMESTAMP: 2658 0 stevel if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) { 2659 0 stevel mstate->dtms_walltimestamp = dtrace_gethrestime(); 2660 0 stevel mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP; 2661 0 stevel } 2662 0 stevel return (mstate->dtms_walltimestamp); 2663 0 stevel 2664 0 stevel case DIF_VAR_IPL: 2665 0 stevel if (!dtrace_priv_kernel(state)) 2666 0 stevel return (0); 2667 0 stevel if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) { 2668 0 stevel mstate->dtms_ipl = dtrace_getipl(); 2669 0 stevel mstate->dtms_present |= DTRACE_MSTATE_IPL; 2670 0 stevel } 2671 0 stevel return (mstate->dtms_ipl); 
2672 0 stevel 2673 0 stevel case DIF_VAR_EPID: 2674 0 stevel ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID); 2675 0 stevel return (mstate->dtms_epid); 2676 0 stevel 2677 0 stevel case DIF_VAR_ID: 2678 0 stevel ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE); 2679 0 stevel return (mstate->dtms_probe->dtpr_id); 2680 0 stevel 2681 0 stevel case DIF_VAR_STACKDEPTH: 2682 0 stevel if (!dtrace_priv_kernel(state)) 2683 0 stevel return (0); 2684 0 stevel if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) { 2685 0 stevel int aframes = mstate->dtms_probe->dtpr_aframes + 2; 2686 0 stevel 2687 0 stevel mstate->dtms_stackdepth = dtrace_getstackdepth(aframes); 2688 0 stevel mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH; 2689 0 stevel } 2690 0 stevel return (mstate->dtms_stackdepth); 2691 0 stevel 2692 191 ahl case DIF_VAR_USTACKDEPTH: 2693 191 ahl if (!dtrace_priv_proc(state)) 2694 191 ahl return (0); 2695 191 ahl if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) { 2696 630 ahl /* 2697 630 ahl * See comment in DIF_VAR_PID. 
2698 630 ahl */ 2699 630 ahl if (DTRACE_ANCHORED(mstate->dtms_probe) && 2700 630 ahl CPU_ON_INTR(CPU)) { 2701 630 ahl mstate->dtms_ustackdepth = 0; 2702 630 ahl } else { 2703 630 ahl DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 2704 630 ahl mstate->dtms_ustackdepth = 2705 630 ahl dtrace_getustackdepth(); 2706 630 ahl DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); 2707 630 ahl } 2708 191 ahl mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH; 2709 191 ahl } 2710 191 ahl return (mstate->dtms_ustackdepth); 2711 191 ahl 2712 0 stevel case DIF_VAR_CALLER: 2713 0 stevel if (!dtrace_priv_kernel(state)) 2714 0 stevel return (0); 2715 0 stevel if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) { 2716 0 stevel int aframes = mstate->dtms_probe->dtpr_aframes + 2; 2717 0 stevel 2718 0 stevel if (!DTRACE_ANCHORED(mstate->dtms_probe)) { 2719 0 stevel /* 2720 0 stevel * If this is an unanchored probe, we are 2721 0 stevel * required to go through the slow path: 2722 0 stevel * dtrace_caller() only guarantees correct 2723 0 stevel * results for anchored probes. 2724 0 stevel */ 2725 0 stevel pc_t caller[2]; 2726 0 stevel 2727 0 stevel dtrace_getpcstack(caller, 2, aframes, 2728 191 ahl (uint32_t *)(uintptr_t)mstate->dtms_arg[0]); 2729 0 stevel mstate->dtms_caller = caller[1]; 2730 0 stevel } else if ((mstate->dtms_caller = 2731 0 stevel dtrace_caller(aframes)) == -1) { 2732 0 stevel /* 2733 0 stevel * We have failed to do this the quick way; 2734 0 stevel * we must resort to the slower approach of 2735 0 stevel * calling dtrace_getpcstack(). 
2736 0 stevel */ 2737 0 stevel pc_t caller; 2738 0 stevel 2739 0 stevel dtrace_getpcstack(&caller, 1, aframes, NULL); 2740 0 stevel mstate->dtms_caller = caller; 2741 0 stevel } 2742 0 stevel 2743 0 stevel mstate->dtms_present |= DTRACE_MSTATE_CALLER; 2744 0 stevel } 2745 0 stevel return (mstate->dtms_caller); 2746 457 bmc 2747 457 bmc case DIF_VAR_UCALLER: 2748 457 bmc if (!dtrace_priv_proc(state)) 2749 457 bmc return (0); 2750 457 bmc 2751 457 bmc if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) { 2752 457 bmc uint64_t ustack[3]; 2753 457 bmc 2754 457 bmc /* 2755 457 bmc * dtrace_getupcstack() fills in the first uint64_t 2756 457 bmc * with the current PID. The second uint64_t will 2757 457 bmc * be the program counter at user-level. The third 2758 457 bmc * uint64_t will contain the caller, which is what 2759 457 bmc * we're after. 2760 457 bmc */ 2761 457 bmc ustack[2] = NULL; 2762 5114 ahl DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 2763 457 bmc dtrace_getupcstack(ustack, 3); 2764 5114 ahl DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); 2765 457 bmc mstate->dtms_ucaller = ustack[2]; 2766 457 bmc mstate->dtms_present |= DTRACE_MSTATE_UCALLER; 2767 457 bmc } 2768 457 bmc 2769 457 bmc return (mstate->dtms_ucaller); 2770 0 stevel 2771 0 stevel case DIF_VAR_PROBEPROV: 2772 0 stevel ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE); 2773 2870 dp return (dtrace_dif_varstr( 2774 2870 dp (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name, 2775 2870 dp state, mstate)); 2776 0 stevel 2777 0 stevel case DIF_VAR_PROBEMOD: 2778 0 stevel ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE); 2779 2870 dp return (dtrace_dif_varstr( 2780 2870 dp (uintptr_t)mstate->dtms_probe->dtpr_mod, 2781 2870 dp state, mstate)); 2782 0 stevel 2783 0 stevel case DIF_VAR_PROBEFUNC: 2784 0 stevel ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE); 2785 2870 dp return (dtrace_dif_varstr( 2786 2870 dp (uintptr_t)mstate->dtms_probe->dtpr_func, 2787 2870 dp state, mstate)); 2788 0 stevel 2789 0 
stevel case DIF_VAR_PROBENAME: 2790 0 stevel ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE); 2791 2870 dp return (dtrace_dif_varstr( 2792 2870 dp (uintptr_t)mstate->dtms_probe->dtpr_name, 2793 2870 dp state, mstate)); 2794 0 stevel 2795 0 stevel case DIF_VAR_PID: 2796 0 stevel if (!dtrace_priv_proc(state)) 2797 0 stevel return (0); 2798 0 stevel 2799 0 stevel /* 2800 0 stevel * Note that we are assuming that an unanchored probe is 2801 0 stevel * always due to a high-level interrupt. (And we're assuming 2802 0 stevel * that there is only a single high level interrupt.) 2803 0 stevel */ 2804 0 stevel if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2805 0 stevel return (pid0.pid_id); 2806 0 stevel 2807 0 stevel /* 2808 0 stevel * It is always safe to dereference one's own t_procp pointer: 2809 0 stevel * it always points to a valid, allocated proc structure. 2810 0 stevel * Further, it is always safe to dereference the p_pidp member 2811 0 stevel * of one's own proc structure. (These are truisms becuase 2812 0 stevel * threads and processes don't clean up their own state -- 2813 0 stevel * they leave that task to whomever reaps them.) 2814 0 stevel */ 2815 0 stevel return ((uint64_t)curthread->t_procp->p_pidp->pid_id); 2816 0 stevel 2817 2525 dp case DIF_VAR_PPID: 2818 2525 dp if (!dtrace_priv_proc(state)) 2819 2525 dp return (0); 2820 2525 dp 2821 2525 dp /* 2822 2525 dp * See comment in DIF_VAR_PID. 2823 2525 dp */ 2824 2525 dp if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2825 2525 dp return (pid0.pid_id); 2826 2525 dp 2827 0 stevel /* 2828 0 stevel * It is always safe to dereference one's own t_procp pointer: 2829 0 stevel * it always points to a valid, allocated proc structure. 2830 0 stevel * (This is true because threads don't clean up their own 2831 0 stevel * state -- they leave that task to whomever reaps them.) 
2832 0 stevel */ 2833 2756 dp return ((uint64_t)curthread->t_procp->p_ppid); 2834 2756 dp 2835 2756 dp case DIF_VAR_TID: 2836 2756 dp /* 2837 2756 dp * See comment in DIF_VAR_PID. 2838 2756 dp */ 2839 2756 dp if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2840 2756 dp return (0); 2841 2756 dp 2842 2756 dp return ((uint64_t)curthread->t_tid); 2843 2756 dp 2844 2756 dp case DIF_VAR_EXECNAME: 2845 0 stevel if (!dtrace_priv_proc(state)) 2846 0 stevel return (0); 2847 0 stevel 2848 0 stevel /* 2849 0 stevel * See comment in DIF_VAR_PID. 2850 0 stevel */ 2851 0 stevel if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2852 2756 dp return ((uint64_t)(uintptr_t)p0.p_user.u_comm); 2853 0 stevel 2854 0 stevel /* 2855 0 stevel * It is always safe to dereference one's own t_procp pointer: 2856 0 stevel * it always points to a valid, allocated proc structure. 2857 0 stevel * (This is true because threads don't clean up their own 2858 0 stevel * state -- they leave that task to whomever reaps them.) 2859 0 stevel */ 2860 2870 dp return (dtrace_dif_varstr( 2861 2870 dp (uintptr_t)curthread->t_procp->p_user.u_comm, 2862 2870 dp state, mstate)); 2863 2756 dp 2864 2756 dp case DIF_VAR_ZONENAME: 2865 2756 dp if (!dtrace_priv_proc(state)) 2866 2756 dp return (0); 2867 2756 dp 2868 2756 dp /* 2869 2756 dp * See comment in DIF_VAR_PID. 2870 2756 dp */ 2871 2756 dp if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2872 2756 dp return ((uint64_t)(uintptr_t)p0.p_zone->zone_name); 2873 2756 dp 2874 2756 dp /* 2875 2756 dp * It is always safe to dereference one's own t_procp pointer: 2876 2756 dp * it always points to a valid, allocated proc structure. 2877 2756 dp * (This is true because threads don't clean up their own 2878 2756 dp * state -- they leave that task to whomever reaps them.) 
2879 2756 dp */ 2880 2870 dp return (dtrace_dif_varstr( 2881 2870 dp (uintptr_t)curthread->t_procp->p_zone->zone_name, 2882 2870 dp state, mstate)); 2883 0 stevel 2884 2525 dp case DIF_VAR_UID: 2885 2525 dp if (!dtrace_priv_proc(state)) 2886 2525 dp return (0); 2887 2525 dp 2888 2525 dp /* 2889 2525 dp * See comment in DIF_VAR_PID. 2890 2525 dp */ 2891 2525 dp if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2892 2525 dp return ((uint64_t)p0.p_cred->cr_uid); 2893 2525 dp 2894 2756 dp /* 2895 2756 dp * It is always safe to dereference one's own t_procp pointer: 2896 2756 dp * it always points to a valid, allocated proc structure. 2897 2756 dp * (This is true because threads don't clean up their own 2898 2756 dp * state -- they leave that task to whomever reaps them.) 2899 2756 dp * 2900 2756 dp * Additionally, it is safe to dereference one's own process 2901 2756 dp * credential, since this is never NULL after process birth. 2902 2756 dp */ 2903 2756 dp return ((uint64_t)curthread->t_procp->p_cred->cr_uid); 2904 2525 dp 2905 2525 dp case DIF_VAR_GID: 2906 2525 dp if (!dtrace_priv_proc(state)) 2907 2525 dp return (0); 2908 2525 dp 2909 2525 dp /* 2910 2525 dp * See comment in DIF_VAR_PID. 2911 2525 dp */ 2912 2525 dp if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2913 2525 dp return ((uint64_t)p0.p_cred->cr_gid); 2914 2525 dp 2915 2756 dp /* 2916 2756 dp * It is always safe to dereference one's own t_procp pointer: 2917 2756 dp * it always points to a valid, allocated proc structure. 2918 2756 dp * (This is true because threads don't clean up their own 2919 2756 dp * state -- they leave that task to whomever reaps them.) 2920 2756 dp * 2921 2756 dp * Additionally, it is safe to dereference one's own process 2922 2756 dp * credential, since this is never NULL after process birth. 
2923 2756 dp */ 2924 2756 dp return ((uint64_t)curthread->t_procp->p_cred->cr_gid); 2925 2525 dp 2926 2525 dp case DIF_VAR_ERRNO: { 2927 2525 dp klwp_t *lwp; 2928 2525 dp if (!dtrace_priv_proc(state)) 2929 2525 dp return (0); 2930 2525 dp 2931 2525 dp /* 2932 2525 dp * See comment in DIF_VAR_PID. 2933 2525 dp */ 2934 2525 dp if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU)) 2935 2525 dp return (0); 2936 2525 dp 2937 2756 dp /* 2938 2756 dp * It is always safe to dereference one's own t_lwp pointer in 2939 2756 dp * the event that this pointer is non-NULL. (This is true 2940 2756 dp * because threads and lwps don't clean up their own state -- 2941 2756 dp * they leave that task to whomever reaps them.) 2942 2756 dp */ 2943 2525 dp if ((lwp = curthread->t_lwp) == NULL) 2944 2525 dp return (0); 2945 2525 dp 2946 2525 dp return ((uint64_t)lwp->lwp_errno); 2947 2525 dp } 2948 0 stevel default: 2949 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); 2950 0 stevel return (0); 2951 0 stevel } 2952 0 stevel } 2953 0 stevel 2954 0 stevel /* 2955 0 stevel * Emulate the execution of DTrace ID subroutines invoked by the call opcode. 2956 0 stevel * Notice that we don't bother validating the proper number of arguments or 2957 0 stevel * their types in the tuple stack. This isn't needed because all argument 2958 0 stevel * interpretation is safe because of our load safety -- the worst that can 2959 0 stevel * happen is that a bogus program can obtain bogus results. 
2960 0 stevel */ 2961 0 stevel static void 2962 0 stevel dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, 2963 0 stevel dtrace_key_t *tupregs, int nargs, 2964 0 stevel dtrace_mstate_t *mstate, dtrace_state_t *state) 2965 0 stevel { 2966 0 stevel volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; 2967 0 stevel volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval; 2968 2870 dp dtrace_vstate_t *vstate = &state->dts_vstate; 2969 0 stevel 2970 0 stevel union { 2971 0 stevel mutex_impl_t mi; 2972 0 stevel uint64_t mx; 2973 0 stevel } m; 2974 0 stevel 2975 0 stevel union { 2976 0 stevel krwlock_t ri; 2977 0 stevel uintptr_t rw; 2978 0 stevel } r; 2979 0 stevel 2980 0 stevel switch (subr) { 2981 0 stevel case DIF_SUBR_RAND: 2982 0 stevel regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875; 2983 0 stevel break; 2984 0 stevel 2985 0 stevel case DIF_SUBR_MUTEX_OWNED: 2986 2870 dp if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), 2987 2870 dp mstate, vstate)) { 2988 2870 dp regs[rd] = NULL; 2989 2870 dp break; 2990 2870 dp } 2991 2870 dp 2992 0 stevel m.mx = dtrace_load64(tupregs[0].dttk_value); 2993 0 stevel if (MUTEX_TYPE_ADAPTIVE(&m.mi)) 2994 0 stevel regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER; 2995 0 stevel else 2996 0 stevel regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock); 2997 0 stevel break; 2998 0 stevel 2999 0 stevel case DIF_SUBR_MUTEX_OWNER: 3000 2870 dp if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), 3001 2870 dp mstate, vstate)) { 3002 2870 dp regs[rd] = NULL; 3003 2870 dp break; 3004 2870 dp } 3005 2870 dp 3006 0 stevel m.mx = dtrace_load64(tupregs[0].dttk_value); 3007 0 stevel if (MUTEX_TYPE_ADAPTIVE(&m.mi) && 3008 0 stevel MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER) 3009 0 stevel regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi); 3010 0 stevel else 3011 0 stevel regs[rd] = 0; 3012 0 stevel break; 3013 0 stevel 3014 0 stevel case DIF_SUBR_MUTEX_TYPE_ADAPTIVE: 3015 2870 dp if 
(!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), 3016 2870 dp mstate, vstate)) { 3017 2870 dp regs[rd] = NULL; 3018 2870 dp break; 3019 2870 dp } 3020 2870 dp 3021 0 stevel m.mx = dtrace_load64(tupregs[0].dttk_value); 3022 0 stevel regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi); 3023 0 stevel break; 3024 0 stevel 3025 0 stevel case DIF_SUBR_MUTEX_TYPE_SPIN: 3026 2870 dp if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), 3027 2870 dp mstate, vstate)) { 3028 2870 dp regs[rd] = NULL; 3029 2870 dp break; 3030 2870 dp } 3031 2870 dp 3032 0 stevel m.mx = dtrace_load64(tupregs[0].dttk_value); 3033 0 stevel regs[rd] = MUTEX_TYPE_SPIN(&m.mi); 3034 0 stevel break; 3035 0 stevel 3036 0 stevel case DIF_SUBR_RW_READ_HELD: { 3037 0 stevel uintptr_t tmp; 3038 0 stevel 3039 2870 dp if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t), 3040 2870 dp mstate, vstate)) { 3041 2870 dp regs[rd] = NULL; 3042 2870 dp break; 3043 2870 dp } 3044 2870 dp 3045 0 stevel r.rw = dtrace_loadptr(tupregs[0].dttk_value); 3046 0 stevel regs[rd] = _RW_READ_HELD(&r.ri, tmp); 3047 0 stevel break; 3048 0 stevel } 3049 0 stevel 3050 0 stevel case DIF_SUBR_RW_WRITE_HELD: 3051 2870 dp if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t), 3052 2870 dp mstate, vstate)) { 3053 2870 dp regs[rd] = NULL; 3054 2870 dp break; 3055 2870 dp } 3056 2870 dp 3057 0 stevel r.rw = dtrace_loadptr(tupregs[0].dttk_value); 3058 0 stevel regs[rd] = _RW_WRITE_HELD(&r.ri); 3059 0 stevel break; 3060 0 stevel 3061 0 stevel case DIF_SUBR_RW_ISWRITER: 3062 2870 dp if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t), 3063 2870 dp mstate, vstate)) { 3064 2870 dp regs[rd] = NULL; 3065 2870 dp break; 3066 2870 dp } 3067 2870 dp 3068 0 stevel r.rw = dtrace_loadptr(tupregs[0].dttk_value); 3069 0 stevel regs[rd] = _RW_ISWRITER(&r.ri); 3070 0 stevel break; 3071 0 stevel 3072 0 stevel case DIF_SUBR_BCOPY: { 3073 0 stevel /* 3074 0 stevel * We need to be sure that the destination is in the scratch 3075 0 
stevel * region -- no other region is allowed. 3076 0 stevel */ 3077 0 stevel uintptr_t src = tupregs[0].dttk_value; 3078 0 stevel uintptr_t dest = tupregs[1].dttk_value; 3079 0 stevel size_t size = tupregs[2].dttk_value; 3080 0 stevel 3081 0 stevel if (!dtrace_inscratch(dest, size, mstate)) { 3082 0 stevel *flags |= CPU_DTRACE_BADADDR; 3083 0 stevel *illval = regs[rd]; 3084 2870 dp break; 3085 2870 dp } 3086 2870 dp 3087 2870 dp if (!dtrace_canload(src, size, mstate, vstate)) { 3088 2870 dp regs[rd] = NULL; 3089 0 stevel break; 3090 0 stevel } 3091 0 stevel 3092 0 stevel dtrace_bcopy((void *)src, (void *)dest, size); 3093 0 stevel break; 3094 0 stevel } 3095 0 stevel 3096 0 stevel case DIF_SUBR_ALLOCA: 3097 0 stevel case DIF_SUBR_COPYIN: { 3098 0 stevel uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8); 3099 0 stevel uint64_t size = 3100 0 stevel tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value; 3101 0 stevel size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size; 3102 0 stevel 3103 0 stevel /* 3104 0 stevel * This action doesn't require any credential checks since 3105 0 stevel * probes will not activate in user contexts to which the 3106 0 stevel * enabling user does not have permissions. 3107 0 stevel */ 3108 2922 dp 3109 2922 dp /* 3110 2922 dp * Rounding up the user allocation size could have overflowed 3111 2922 dp * a large, bogus allocation (like -1ULL) to 0. 
3112 2922 dp */ 3113 2922 dp if (scratch_size < size || 3114 2922 dp !DTRACE_INSCRATCH(mstate, scratch_size)) { 3115 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); 3116 0 stevel regs[rd] = NULL; 3117 0 stevel break; 3118 0 stevel } 3119 0 stevel 3120 0 stevel if (subr == DIF_SUBR_COPYIN) { 3121 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 3122 3677 sudheer dtrace_copyin(tupregs[0].dttk_value, dest, size, flags); 3123 0 stevel DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); 3124 0 stevel } 3125 0 stevel 3126 0 stevel mstate->dtms_scratch_ptr += scratch_size; 3127 0 stevel regs[rd] = dest; 3128 0 stevel break; 3129 0 stevel } 3130 0 stevel 3131 0 stevel case DIF_SUBR_COPYINTO: { 3132 0 stevel uint64_t size = tupregs[1].dttk_value; 3133 0 stevel uintptr_t dest = tupregs[2].dttk_value; 3134 0 stevel 3135 0 stevel /* 3136 0 stevel * This action doesn't require any credential checks since 3137 0 stevel * probes will not activate in user contexts to which the 3138 0 stevel * enabling user does not have permissions. 3139 0 stevel */ 3140 0 stevel if (!dtrace_inscratch(dest, size, mstate)) { 3141 0 stevel *flags |= CPU_DTRACE_BADADDR; 3142 0 stevel *illval = regs[rd]; 3143 0 stevel break; 3144 0 stevel } 3145 0 stevel 3146 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 3147 3677 sudheer dtrace_copyin(tupregs[0].dttk_value, dest, size, flags); 3148 0 stevel DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); 3149 0 stevel break; 3150 0 stevel } 3151 0 stevel 3152 0 stevel case DIF_SUBR_COPYINSTR: { 3153 0 stevel uintptr_t dest = mstate->dtms_scratch_ptr; 3154 0 stevel uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; 3155 0 stevel 3156 0 stevel if (nargs > 1 && tupregs[1].dttk_value < size) 3157 0 stevel size = tupregs[1].dttk_value + 1; 3158 0 stevel 3159 0 stevel /* 3160 0 stevel * This action doesn't require any credential checks since 3161 0 stevel * probes will not activate in user contexts to which the 3162 0 stevel * enabling user does not have permissions. 
3163 0 stevel */ 3164 2922 dp if (!DTRACE_INSCRATCH(mstate, size)) { 3165 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); 3166 0 stevel regs[rd] = NULL; 3167 0 stevel break; 3168 0 stevel } 3169 0 stevel 3170 0 stevel DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); 3171 3677 sudheer dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags); 3172 0 stevel DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); 3173 0 stevel 3174 0 stevel ((char *)dest)[size - 1] = '\0'; 3175 0 stevel mstate->dtms_scratch_ptr += size; 3176 0 stevel regs[rd] = dest; 3177 0 stevel break; 3178 0 stevel } 3179 0 stevel 3180 0 stevel case DIF_SUBR_MSGSIZE: 3181 0 stevel case DIF_SUBR_MSGDSIZE