Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/exacct.h>
     27 #include <sys/exacct_catalog.h>
     28 #include <sys/disp.h>
     29 #include <sys/task.h>
     30 #include <sys/proc.h>
     31 #include <sys/cmn_err.h>
     32 #include <sys/kmem.h>
     33 #include <sys/project.h>
     34 #include <sys/systm.h>
     35 #include <sys/vnode.h>
     36 #include <sys/file.h>
     37 #include <sys/acctctl.h>
     38 #include <sys/time.h>
     39 #include <sys/utsname.h>
     40 #include <sys/session.h>
     41 #include <sys/sysmacros.h>
     42 #include <sys/bitmap.h>
     43 #include <sys/msacct.h>
     44 
     45 /*
     46  * exacct usage and recording routines
     47  *
     48  * wracct(2), getacct(2), and the records written at process or task
     49  * termination are constructed using the exacct_assemble_[task,proc]_usage()
     50  * functions, which take a callback that takes the appropriate action on
     51  * the packed exacct record for the task or process.  For the process-related
     52  * actions, we partition the routines such that the data collecting component
     53  * can be performed while holding p_lock, and all sleeping or blocking
     54  * operations can be performed without acquiring p_lock.
     55  *
     56  * putacct(2), which allows an application to construct a customized record
     57  * associated with an existing process or task, has its own entry points:
     58  * exacct_tag_task() and exacct_tag_proc().
     59  */
     60 
/*
 * Task queue for exacct work.
 * NOTE(review): presumably the queue from which exacct_commit_task() is run;
 * its creation and its users are not in this part of the file -- confirm.
 */
taskq_t *exacct_queue;
/* kmem cache backing the allocations in ea_alloc_item()/ea_alloc_group(). */
kmem_cache_t *exacct_object_cache;

/*
 * Key used with zone_getspecific() to find a zone's struct exacct_globals.
 * exacct_commit_task() treats the ZONE_KEY_UNINITIALIZED value as "the
 * acctctl module is not loaded" and skips record generation.
 */
zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;

/* Constant items attached to the header group by exacct_create_header(). */
static const uint32_t exacct_version = EXACCT_VERSION;
static const char exacct_header[] = "exacct";
static const char exacct_creator[] = "SunOS";
     69 
     70 ea_object_t *
     71 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
     72 {
     73 	ea_object_t *item;
     74 
     75 	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
     76 	bzero(item, sizeof (ea_object_t));
     77 	(void) ea_set_item(item, catalog, buf, bufsz);
     78 	return (item);
     79 }
     80 
     81 ea_object_t *
     82 ea_alloc_group(ea_catalog_t catalog)
     83 {
     84 	ea_object_t *group;
     85 
     86 	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
     87 	bzero(group, sizeof (ea_object_t));
     88 	(void) ea_set_group(group, catalog);
     89 	return (group);
     90 }
     91 
     92 ea_object_t *
     93 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
     94 {
     95 	ea_object_t *item;
     96 
     97 	item = ea_alloc_item(catalog, buf, bufsz);
     98 	(void) ea_attach_to_group(grp, item);
     99 	return (item);
    100 }
    101 
    102 /*
    103  * exacct_add_task_mstate() and exacct_sub_task_mstate() add and subtract
    104  * microstate accounting data and resource usage counters from one task_usage_t
    105  * from those supplied in another. These functions do not operate on *all*
    106  * members of a task_usage_t: for some (e.g. tu_anctaskid) it would not make
    107  * sense.
    108  */
    109 static void
    110 exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta)
    111 {
    112 	tu->tu_utime  += delta->tu_utime;
    113 	tu->tu_stime  += delta->tu_stime;
    114 	tu->tu_minflt += delta->tu_minflt;
    115 	tu->tu_majflt += delta->tu_majflt;
    116 	tu->tu_sndmsg += delta->tu_sndmsg;
    117 	tu->tu_rcvmsg += delta->tu_rcvmsg;
    118 	tu->tu_ioch   += delta->tu_ioch;
    119 	tu->tu_iblk   += delta->tu_iblk;
    120 	tu->tu_oblk   += delta->tu_oblk;
    121 	tu->tu_vcsw   += delta->tu_vcsw;
    122 	tu->tu_icsw   += delta->tu_icsw;
    123 	tu->tu_nsig   += delta->tu_nsig;
    124 	tu->tu_nswp   += delta->tu_nswp;
    125 	tu->tu_nscl   += delta->tu_nscl;
    126 }
    127 
    128 /*
    129  * See the comments for exacct_add_task_mstate(), above.
    130  */
    131 static void
    132 exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta)
    133 {
    134 	tu->tu_utime  -= delta->tu_utime;
    135 	tu->tu_stime  -= delta->tu_stime;
    136 	tu->tu_minflt -= delta->tu_minflt;
    137 	tu->tu_majflt -= delta->tu_majflt;
    138 	tu->tu_sndmsg -= delta->tu_sndmsg;
    139 	tu->tu_rcvmsg -= delta->tu_rcvmsg;
    140 	tu->tu_ioch   -= delta->tu_ioch;
    141 	tu->tu_iblk   -= delta->tu_iblk;
    142 	tu->tu_oblk   -= delta->tu_oblk;
    143 	tu->tu_vcsw   -= delta->tu_vcsw;
    144 	tu->tu_icsw   -= delta->tu_icsw;
    145 	tu->tu_nsig   -= delta->tu_nsig;
    146 	tu->tu_nswp   -= delta->tu_nswp;
    147 	tu->tu_nscl   -= delta->tu_nscl;
    148 }
    149 
    150 /*
    151  * Wrapper for vn_rdwr() used by exacct_vn_write() and exacct_write_header()
    152  * to write to the accounting file without corrupting it in case of an I/O or
    153  * filesystem error.
    154  */
    155 static int
    156 exacct_vn_write_impl(ac_info_t *info, void *buf, ssize_t bufsize)
    157 {
    158 	int error;
    159 	ssize_t resid;
    160 	struct vattr va;
    161 
    162 	ASSERT(info != NULL);
    163 	ASSERT(info->ac_vnode != NULL);
    164 	ASSERT(MUTEX_HELD(&info->ac_lock));
    165 
    166 	/*
    167 	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
    168 	 * the present accounting file.
    169 	 */
    170 	va.va_mask = AT_SIZE;
    171 	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred, NULL);
    172 	if (error == 0) {
    173 		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
    174 		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
    175 		    kcred, &resid);
    176 		if (error) {
    177 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
    178 		} else if (resid != 0) {
    179 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
    180 			error = ENOSPC;
    181 		}
    182 	}
    183 	return (error);
    184 }
    185 
    186 /*
    187  * exacct_vn_write() safely writes to an accounting file.  acctctl() prevents
    188  * the two accounting vnodes from being equal, and the appropriate ac_lock is
    189  * held across the call, so we're single threaded through this code for each
    190  * file.
    191  */
    192 static int
    193 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
    194 {
    195 	int error;
    196 
    197 	if (info == NULL)
    198 		return (0);
    199 
    200 	mutex_enter(&info->ac_lock);
    201 
    202 	/*
    203 	 * Don't do anything unless accounting file is set.
    204 	 */
    205 	if (info->ac_vnode == NULL) {
    206 		mutex_exit(&info->ac_lock);
    207 		return (0);
    208 	}
    209 	error = exacct_vn_write_impl(info, buf, bufsize);
    210 	mutex_exit(&info->ac_lock);
    211 
    212 	return (error);
    213 }
    214 
    215 /*
    216  * void *exacct_create_header(size_t *)
    217  *
    218  * Overview
    219  *   exacct_create_header() constructs an exacct file header identifying the
    220  *   accounting file as the output of the kernel.  exacct_create_header() and
    221  *   the static write_header() and verify_header() routines in libexacct must
    222  *   remain synchronized.
    223  *
    224  * Return values
    225  *   A pointer to a packed exacct buffer containing the appropriate header is
    226  *   returned; the size of the buffer is placed in the location indicated by
    227  *   sizep.
    228  *
    229  * Caller's context
    230  *   Suitable for KM_SLEEP allocations.
    231  */
    232 void *
    233 exacct_create_header(size_t *sizep)
    234 {
    235 	ea_object_t *hdr_grp;
    236 	uint32_t bskip;
    237 	void *buf;
    238 	size_t bufsize;
    239 
    240 	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
    241 	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
    242 	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
    243 	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
    244 	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
    245 	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
    246 	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
    247 	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
    248 	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
    249 
    250 	bufsize = ea_pack_object(hdr_grp, NULL, 0);
    251 	buf = kmem_alloc(bufsize, KM_SLEEP);
    252 	(void) ea_pack_object(hdr_grp, buf, bufsize);
    253 	ea_free_object(hdr_grp, EUP_ALLOC);
    254 
    255 	/*
    256 	 * To prevent reading the header when reading the file backwards,
    257 	 * set the large backskip of the header group to 0 (last 4 bytes).
    258 	 */
    259 	bskip = 0;
    260 	exacct_order32(&bskip);
    261 	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
    262 	    sizeof (bskip));
    263 
    264 	*sizep = bufsize;
    265 	return (buf);
    266 }
    267 
    268 /*
    269  * int exacct_write_header(ac_info_t *, void *, size_t)
    270  *
    271  * Overview
    272  *   exacct_write_header() writes the given header buffer to the indicated
    273  *   vnode.
    274  *
    275  * Return values
    276  *   The result of the write operation is returned.
    277  *
    278  * Caller's context
    279  *   Caller must hold the ac_lock of the appropriate accounting file
    280  *   information block (ac_info_t).
    281  */
    282 int
    283 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
    284 {
    285 	if (info != NULL && info->ac_vnode != NULL)
    286 		return (exacct_vn_write_impl(info, hdr, hdrsize));
    287 
    288 	return (0);
    289 }
    290 
    291 static void
    292 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
    293     task_usage_t **tu_buf)
    294 {
    295 	task_usage_t *oldtu, *newtu;
    296 	task_usage_t **prevusage;
    297 
    298 	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
    299 	if (getzoneid() != GLOBAL_ZONEID) {
    300 		prevusage = &tk->tk_zoneusage;
    301 	} else {
    302 		prevusage = &tk->tk_prevusage;
    303 	}
    304 	if ((oldtu = *prevusage) != NULL) {
    305 		/*
    306 		 * In case we have any accounting information
    307 		 * saved from the previous interval record.
    308 		 */
    309 		newtu = *tu_buf;
    310 		bcopy(tu, newtu, sizeof (task_usage_t));
    311 		tu->tu_minflt	-= oldtu->tu_minflt;
    312 		tu->tu_majflt	-= oldtu->tu_majflt;
    313 		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
    314 		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
    315 		tu->tu_ioch	-= oldtu->tu_ioch;
    316 		tu->tu_iblk	-= oldtu->tu_iblk;
    317 		tu->tu_oblk	-= oldtu->tu_oblk;
    318 		tu->tu_vcsw	-= oldtu->tu_vcsw;
    319 		tu->tu_icsw	-= oldtu->tu_icsw;
    320 		tu->tu_nsig	-= oldtu->tu_nsig;
    321 		tu->tu_nswp	-= oldtu->tu_nswp;
    322 		tu->tu_nscl	-= oldtu->tu_nscl;
    323 		tu->tu_utime	-= oldtu->tu_utime;
    324 		tu->tu_stime	-= oldtu->tu_stime;
    325 
    326 		tu->tu_startsec = oldtu->tu_finishsec;
    327 		tu->tu_startnsec = oldtu->tu_finishnsec;
    328 		/*
    329 		 * Copy the data from our temporary storage to the task's
    330 		 * previous interval usage structure for future reference.
    331 		 */
    332 		bcopy(newtu, oldtu, sizeof (task_usage_t));
    333 	} else {
    334 		/*
    335 		 * Store current statistics in the task's previous interval
    336 		 * usage structure for future references.
    337 		 */
    338 		*prevusage = *tu_buf;
    339 		bcopy(tu, *prevusage, sizeof (task_usage_t));
    340 		*tu_buf = NULL;
    341 	}
    342 }
    343 
    344 static void
    345 exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
    346 {
    347 	timestruc_t ts;
    348 	proc_t *p;
    349 
    350 	ASSERT(MUTEX_HELD(&pidlock));
    351 
    352 	if ((p = tk->tk_memb_list) == NULL)
    353 		return;
    354 
    355 	/*
    356 	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
    357 	 * usage of the potentially many members of the task.  Since we don't
    358 	 * guarantee exactness, we don't acquire the p_lock of any of the member
    359 	 * processes.
    360 	 */
    361 	do {
    362 		mutex_enter(&p->p_lock);
    363 		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
    364 		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
    365 		mutex_exit(&p->p_lock);
    366 		tu->tu_minflt	+= p->p_ru.minflt;
    367 		tu->tu_majflt	+= p->p_ru.majflt;
    368 		tu->tu_sndmsg	+= p->p_ru.msgsnd;
    369 		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
    370 		tu->tu_ioch	+= p->p_ru.ioch;
    371 		tu->tu_iblk	+= p->p_ru.inblock;
    372 		tu->tu_oblk	+= p->p_ru.oublock;
    373 		tu->tu_vcsw	+= p->p_ru.nvcsw;
    374 		tu->tu_icsw	+= p->p_ru.nivcsw;
    375 		tu->tu_nsig	+= p->p_ru.nsignals;
    376 		tu->tu_nswp	+= p->p_ru.nswap;
    377 		tu->tu_nscl	+= p->p_ru.sysc;
    378 	} while ((p = p->p_tasknext) != tk->tk_memb_list);
    379 
    380 	/*
    381 	 * The resource usage accounted for so far will include that
    382 	 * contributed by the task's first process. If this process
    383 	 * came from another task, then its accumulated resource usage
    384 	 * will include a contribution from work performed there.
    385 	 * We must therefore subtract any resource usage that was
    386 	 * inherited with the first process.
    387 	 */
    388 	exacct_sub_task_mstate(tu, tk->tk_inherited);
    389 
    390 	gethrestime(&ts);
    391 	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
    392 	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
    393 }
    394 
    395 /*
    396  * void exacct_update_task_mstate(proc_t *)
    397  *
    398  * Overview
    399  *   exacct_update_task_mstate() updates the task usage; it is intended
    400  *   to be called from proc_exit().
    401  *
    402  * Return values
    403  *   None.
    404  *
    405  * Caller's context
    406  *   p_lock must be held at entry.
    407  */
    408 void
    409 exacct_update_task_mstate(proc_t *p)
    410 {
    411 	task_usage_t *tu;
    412 
    413 	mutex_enter(&p->p_task->tk_usage_lock);
    414 	tu = p->p_task->tk_usage;
    415 	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
    416 	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
    417 	tu->tu_minflt	+= p->p_ru.minflt;
    418 	tu->tu_majflt	+= p->p_ru.majflt;
    419 	tu->tu_sndmsg	+= p->p_ru.msgsnd;
    420 	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
    421 	tu->tu_ioch	+= p->p_ru.ioch;
    422 	tu->tu_iblk	+= p->p_ru.inblock;
    423 	tu->tu_oblk	+= p->p_ru.oublock;
    424 	tu->tu_vcsw	+= p->p_ru.nvcsw;
    425 	tu->tu_icsw	+= p->p_ru.nivcsw;
    426 	tu->tu_nsig	+= p->p_ru.nsignals;
    427 	tu->tu_nswp	+= p->p_ru.nswap;
    428 	tu->tu_nscl	+= p->p_ru.sysc;
    429 	mutex_exit(&p->p_task->tk_usage_lock);
    430 }
    431 
    432 static void
    433 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
    434 {
    435 	timestruc_t ts;
    436 	task_usage_t *tu_buf;
    437 
    438 	switch (flag) {
    439 	case EW_PARTIAL:
    440 		/*
    441 		 * For partial records we must report the sum of current
    442 		 * accounting statistics with previously accumulated
    443 		 * statistics.
    444 		 */
    445 		mutex_enter(&pidlock);
    446 		mutex_enter(&tk->tk_usage_lock);
    447 
    448 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
    449 		exacct_snapshot_task_usage(tk, tu);
    450 
    451 		mutex_exit(&tk->tk_usage_lock);
    452 		mutex_exit(&pidlock);
    453 		break;
    454 	case EW_INTERVAL:
    455 		/*
    456 		 * We need to allocate spare task_usage_t buffer before
    457 		 * grabbing pidlock because we might need it later in
    458 		 * exacct_get_interval_task_usage().
    459 		 */
    460 		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
    461 		mutex_enter(&pidlock);
    462 		mutex_enter(&tk->tk_usage_lock);
    463 
    464 		/*
    465 		 * For interval records, we deduct the previous microstate
    466 		 * accounting data and cpu usage times from previously saved
    467 		 * results and update the previous task usage structure.
    468 		 */
    469 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
    470 		exacct_snapshot_task_usage(tk, tu);
    471 		exacct_get_interval_task_usage(tk, tu, &tu_buf);
    472 
    473 		mutex_exit(&tk->tk_usage_lock);
    474 		mutex_exit(&pidlock);
    475 
    476 		if (tu_buf != NULL)
    477 			kmem_free(tu_buf, sizeof (task_usage_t));
    478 		break;
    479 	case EW_FINAL:
    480 		/*
    481 		 * For final records, we deduct, from the task's current
    482 		 * usage, any usage that was inherited with the arrival
    483 		 * of a process from a previous task. We then record
    484 		 * the task's finish time.
    485 		 */
    486 		mutex_enter(&tk->tk_usage_lock);
    487 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
    488 		exacct_sub_task_mstate(tu, tk->tk_inherited);
    489 		mutex_exit(&tk->tk_usage_lock);
    490 
    491 		gethrestime(&ts);
    492 		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
    493 		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
    494 
    495 		break;
    496 	}
    497 }
    498 
    499 static int
    500 exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
    501     int res)
    502 {
    503 	int attached = 1;
    504 
    505 	switch (res) {
    506 	case AC_TASK_TASKID:
    507 		(void) ea_attach_item(record, &tk->tk_tkid,
    508 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
    509 		break;
    510 	case AC_TASK_PROJID:
    511 		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
    512 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
    513 		break;
    514 	case AC_TASK_CPU: {
    515 			timestruc_t ts;
    516 			uint64_t ui;
    517 
    518 			hrt2ts(tu->tu_stime, &ts);
    519 			ui = ts.tv_sec;
    520 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
    521 			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
    522 			ui = ts.tv_nsec;
    523 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
    524 			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);
    525 
    526 			hrt2ts(tu->tu_utime, &ts);
    527 			ui = ts.tv_sec;
    528 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
    529 			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
    530 			ui = ts.tv_nsec;
    531 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
    532 			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
    533 		}
    534 		break;
    535 	case AC_TASK_TIME:
    536 		(void) ea_attach_item(record, &tu->tu_startsec,
    537 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
    538 		(void) ea_attach_item(record, &tu->tu_startnsec,
    539 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
    540 		(void) ea_attach_item(record, &tu->tu_finishsec,
    541 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
    542 		(void) ea_attach_item(record, &tu->tu_finishnsec,
    543 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
    544 		break;
    545 	case AC_TASK_HOSTNAME:
    546 		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
    547 		    strlen(tk->tk_zone->zone_nodename) + 1,
    548 		    EXT_STRING | EXD_TASK_HOSTNAME);
    549 			break;
    550 	case AC_TASK_MICROSTATE:
    551 		(void) ea_attach_item(record, &tu->tu_majflt,
    552 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
    553 		(void) ea_attach_item(record, &tu->tu_minflt,
    554 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
    555 		(void) ea_attach_item(record, &tu->tu_sndmsg,
    556 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
    557 		(void) ea_attach_item(record, &tu->tu_rcvmsg,
    558 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
    559 		(void) ea_attach_item(record, &tu->tu_iblk,
    560 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
    561 		(void) ea_attach_item(record, &tu->tu_oblk,
    562 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
    563 		(void) ea_attach_item(record, &tu->tu_ioch,
    564 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
    565 		(void) ea_attach_item(record, &tu->tu_vcsw,
    566 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
    567 		(void) ea_attach_item(record, &tu->tu_icsw,
    568 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
    569 		(void) ea_attach_item(record, &tu->tu_nsig,
    570 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
    571 		(void) ea_attach_item(record, &tu->tu_nswp,
    572 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
    573 		(void) ea_attach_item(record, &tu->tu_nscl,
    574 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
    575 		break;
    576 	case AC_TASK_ANCTASKID:
    577 		(void) ea_attach_item(record, &tu->tu_anctaskid,
    578 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
    579 		break;
    580 	case AC_TASK_ZONENAME:
    581 		(void) ea_attach_item(record, tk->tk_zone->zone_name,
    582 		    strlen(tk->tk_zone->zone_name) + 1,
    583 		    EXT_STRING | EXD_TASK_ZONENAME);
    584 		break;
    585 	default:
    586 		attached = 0;
    587 	}
    588 	return (attached);
    589 }
    590 
    591 static ea_object_t *
    592 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
    593     ea_catalog_t record_type)
    594 {
    595 	int res, count;
    596 	ea_object_t *record;
    597 
    598 	/*
    599 	 * Assemble usage values into group.
    600 	 */
    601 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
    602 	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
    603 		if (BT_TEST(mask, res))
    604 			count += exacct_attach_task_item(tk, tu, record, res);
    605 	if (count == 0) {
    606 		ea_free_object(record, EUP_ALLOC);
    607 		record = NULL;
    608 	}
    609 	return (record);
    610 }
    611 
    612 /*
    613  * int exacct_assemble_task_usage(task_t *, int (*)(void *, size_t, void *,
    614  *	size_t, size_t *), void *, size_t, size_t *, int)
    615  *
    616  * Overview
    617  *   exacct_assemble_task_usage() builds the packed exacct buffer for the
    618  *   indicated task, executes the given callback function, and free the packed
    619  *   buffer.
    620  *
    621  * Return values
    622  *   Returns 0 on success; otherwise the appropriate error code is returned.
    623  *
    624  * Caller's context
    625  *   Suitable for KM_SLEEP allocations.
    626  */
    627 int
    628 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
    629     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
    630     void *ubuf, size_t ubufsize, size_t *actual, int flag)
    631 {
    632 	ulong_t mask[AC_MASK_SZ];
    633 	ea_object_t *task_record;
    634 	ea_catalog_t record_type;
    635 	task_usage_t *tu;
    636 	void *buf;
    637 	size_t bufsize;
    638 	int ret;
    639 
    640 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
    641 
    642 	mutex_enter(&ac_task->ac_lock);
    643 	if (ac_task->ac_state == AC_OFF) {
    644 		mutex_exit(&ac_task->ac_lock);
    645 		return (ENOTACTIVE);
    646 	}
    647 	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
    648 	mutex_exit(&ac_task->ac_lock);
    649 
    650 	switch (flag) {
    651 	case EW_FINAL:
    652 		record_type = EXD_GROUP_TASK;
    653 		break;
    654 	case EW_PARTIAL:
    655 		record_type = EXD_GROUP_TASK_PARTIAL;
    656 		break;
    657 	case EW_INTERVAL:
    658 		record_type = EXD_GROUP_TASK_INTERVAL;
    659 		break;
    660 	}
    661 
    662 	/*
    663 	 * Calculate task usage and assemble it into the task record.
    664 	 */
    665 	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
    666 	exacct_calculate_task_usage(tk, tu, flag);
    667 	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
    668 	if (task_record == NULL) {
    669 		/*
    670 		 * The current configuration of the accounting system has
    671 		 * resulted in records with no data; accordingly, we don't write
    672 		 * these, but we return success.
    673 		 */
    674 		kmem_free(tu, sizeof (task_usage_t));
    675 		return (0);
    676 	}
    677 
    678 	/*
    679 	 * Pack object into buffer and run callback on it.
    680 	 */
    681 	bufsize = ea_pack_object(task_record, NULL, 0);
    682 	buf = kmem_alloc(bufsize, KM_SLEEP);
    683 	(void) ea_pack_object(task_record, buf, bufsize);
    684 	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
    685 
    686 	/*
    687 	 * Free all previously allocated structures.
    688 	 */
    689 	kmem_free(buf, bufsize);
    690 	ea_free_object(task_record, EUP_ALLOC);
    691 	kmem_free(tu, sizeof (task_usage_t));
    692 	return (ret);
    693 }
    694 
    695 /*
    696  * void exacct_commit_task(void *)
    697  *
    698  * Overview
    699  *   exacct_commit_task() calculates the final usage for a task, updating the
    700  *   task usage if task accounting is active, and writing a task record if task
    701  *   accounting is active.  exacct_commit_task() is intended for being called
    702  *   from a task queue (taskq_t).
    703  *
    704  * Return values
    705  *   None.
    706  *
    707  * Caller's context
    708  *   Suitable for KM_SLEEP allocations.
    709  */
    710 
    711 void
    712 exacct_commit_task(void *arg)
    713 {
    714 	task_t *tk = (task_t *)arg;
    715 	size_t size;
    716 	zone_t *zone = tk->tk_zone;
    717 	struct exacct_globals *acg;
    718 
    719 	ASSERT(tk != task0p);
    720 	ASSERT(tk->tk_memb_list == NULL);
    721 
    722 	/*
    723 	 * Don't do any extra work if the acctctl module isn't loaded.
    724 	 */
    725 	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
    726 		acg = zone_getspecific(exacct_zone_key, zone);
    727 		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
    728 		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
    729 		if (tk->tk_zone != global_zone) {
    730 			acg = zone_getspecific(exacct_zone_key, global_zone);
    731 			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
    732 			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
    733 		}
    734 	}
    735 	/*
    736 	 * Release associated project and finalize task.
    737 	 */
    738 	task_end(tk);
    739 }
    740 
    741 static int
    742 exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
    743 {
    744 	int attached = 1;
    745 
    746 	switch (res) {
    747 	case AC_PROC_PID:
    748 		(void) ea_attach_item(record, &pu->pu_pid,
    749 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
    750 		break;
    751 	case AC_PROC_UID:
    752 		(void) ea_attach_item(record, &pu->pu_ruid,
    753 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
    754 		break;
    755 	case AC_PROC_FLAG:
    756 		(void) ea_attach_item(record, &pu->pu_acflag,
    757 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
    758 		break;
    759 	case AC_PROC_GID:
    760 		(void) ea_attach_item(record, &pu->pu_rgid,
    761 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
    762 		break;
    763 	case AC_PROC_PROJID:
    764 		(void) ea_attach_item(record, &pu->pu_projid,
    765 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
    766 		break;
    767 	case AC_PROC_TASKID:
    768 		(void) ea_attach_item(record, &pu->pu_taskid,
    769 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
    770 		break;
    771 	case AC_PROC_CPU:
    772 		(void) ea_attach_item(record, &pu->pu_utimesec,
    773 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
    774 		(void) ea_attach_item(record, &pu->pu_utimensec,
    775 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
    776 		(void) ea_attach_item(record, &pu->pu_stimesec,
    777 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
    778 		(void) ea_attach_item(record, &pu->pu_stimensec,
    779 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
    780 		break;
    781 	case AC_PROC_TIME:
    782 		(void) ea_attach_item(record, &pu->pu_startsec,
    783 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
    784 		(void) ea_attach_item(record, &pu->pu_startnsec,
    785 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
    786 		(void) ea_attach_item(record, &pu->pu_finishsec,
    787 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
    788 		(void) ea_attach_item(record, &pu->pu_finishnsec,
    789 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
    790 		break;
    791 	case AC_PROC_COMMAND:
    792 		(void) ea_attach_item(record, pu->pu_command,
    793 		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
    794 		break;
    795 	case AC_PROC_HOSTNAME:
    796 		(void) ea_attach_item(record, pu->pu_nodename,
    797 		    strlen(pu->pu_nodename) + 1,
    798 		    EXT_STRING | EXD_PROC_HOSTNAME);
    799 		break;
    800 	case AC_PROC_TTY:
    801 		(void) ea_attach_item(record, &pu->pu_major,
    802 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
    803 		(void) ea_attach_item(record, &pu->pu_minor,
    804 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
    805 		break;
    806 	case AC_PROC_MICROSTATE:
    807 		(void) ea_attach_item(record, &pu->pu_majflt,
    808 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
    809 		(void) ea_attach_item(record, &pu->pu_minflt,
    810 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
    811 		(void) ea_attach_item(record, &pu->pu_sndmsg,
    812 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
    813 		(void) ea_attach_item(record, &pu->pu_rcvmsg,
    814 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
    815 		(void) ea_attach_item(record, &pu->pu_iblk,
    816 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
    817 		(void) ea_attach_item(record, &pu->pu_oblk,
    818 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
    819 		(void) ea_attach_item(record, &pu->pu_ioch,
    820 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
    821 		(void) ea_attach_item(record, &pu->pu_vcsw,
    822 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
    823 		(void) ea_attach_item(record, &pu->pu_icsw,
    824 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
    825 		(void) ea_attach_item(record, &pu->pu_nsig,
    826 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
    827 		(void) ea_attach_item(record, &pu->pu_nswp,
    828 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
    829 		(void) ea_attach_item(record, &pu->pu_nscl,
    830 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
    831 		break;
    832 	case AC_PROC_ANCPID:
    833 		(void) ea_attach_item(record, &pu->pu_ancpid,
    834 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
    835 		break;
    836 	case AC_PROC_WAIT_STATUS:
    837 		(void) ea_attach_item(record, &pu->pu_wstat,
    838 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
    839 		break;
    840 	case AC_PROC_ZONENAME:
    841 		(void) ea_attach_item(record, pu->pu_zonename,
    842 		    strlen(pu->pu_zonename) + 1,
    843 		    EXT_STRING | EXD_PROC_ZONENAME);
    844 		break;
    845 	case AC_PROC_MEM:
    846 		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
    847 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
    848 		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
    849 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
    850 		break;
    851 	default:
    852 		attached = 0;
    853 	}
    854 	return (attached);
    855 }
    856 
    857 static ea_object_t *
    858 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
    859     ea_catalog_t record_type)
    860 {
    861 	int res, count;
    862 	ea_object_t *record;
    863 
    864 	/*
    865 	 * Assemble usage values into group.
    866 	 */
    867 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
    868 	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
    869 		if (BT_TEST(mask, res))
    870 			count += exacct_attach_proc_item(pu, record, res);
    871 	if (count == 0) {
    872 		ea_free_object(record, EUP_ALLOC);
    873 		record = NULL;
    874 	}
    875 	return (record);
    876 }
    877 
    878 /*
    879  * The following two routines assume that process's p_lock is held or
    880  * exacct_commit_proc has been called from exit() when all lwps are stopped.
    881  */
    882 static void
    883 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
    884 {
    885 	kthread_t *t;
    886 
    887 	ASSERT(MUTEX_HELD(&p->p_lock));
    888 	if ((t = p->p_tlist) == NULL)
    889 		return;
    890 
    891 	do {
    892 		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
    893 		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
    894 		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
    895 		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
    896 		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
    897 		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
    898 		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
    899 		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
    900 		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
    901 		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
    902 		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
    903 		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
    904 	} while ((t = t->t_forw) != p->p_tlist);
    905 }
    906 
    907 static void
    908 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
    909 {
    910 	pu->pu_minflt	= p->p_ru.minflt;
    911 	pu->pu_majflt	= p->p_ru.majflt;
    912 	pu->pu_sndmsg	= p->p_ru.msgsnd;
    913 	pu->pu_rcvmsg	= p->p_ru.msgrcv;
    914 	pu->pu_ioch	= p->p_ru.ioch;
    915 	pu->pu_iblk	= p->p_ru.inblock;
    916 	pu->pu_oblk	= p->p_ru.oublock;
    917 	pu->pu_vcsw	= p->p_ru.nvcsw;
    918 	pu->pu_icsw	= p->p_ru.nivcsw;
    919 	pu->pu_nsig	= p->p_ru.nsignals;
    920 	pu->pu_nswp	= p->p_ru.nswap;
    921 	pu->pu_nscl	= p->p_ru.sysc;
    922 }
    923 
    924 void
    925 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
    926     int flag, int wstat)
    927 {
    928 	timestruc_t ts, ts_run;
    929 
    930 	ASSERT(MUTEX_HELD(&p->p_lock));
    931 
    932 	/*
    933 	 * Convert CPU and execution times to sec/nsec format.
    934 	 */
    935 	if (BT_TEST(mask, AC_PROC_CPU)) {
    936 		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
    937 		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
    938 		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
    939 		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
    940 		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
    941 		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
    942 	}
    943 	if (BT_TEST(mask, AC_PROC_TIME)) {
    944 		gethrestime(&ts);
    945 		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
    946 		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
    947 		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
    948 		ts.tv_sec -= ts_run.tv_sec;
    949 		ts.tv_nsec -= ts_run.tv_nsec;
    950 		if (ts.tv_nsec < 0) {
    951 			ts.tv_sec--;
    952 			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
    953 				ts.tv_sec++;
    954 				ts.tv_nsec -= NANOSEC;
    955 			}
    956 		}
    957 		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
    958 		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
    959 	}
    960 
    961 	pu->pu_pid = p->p_pidp->pid_id;
    962 	pu->pu_acflag = p->p_user.u_acflag;
    963 	pu->pu_projid = p->p_task->tk_proj->kpj_id;
    964 	pu->pu_taskid = p->p_task->tk_tkid;
    965 	pu->pu_major = getmajor(p->p_sessp->s_dev);
    966 	pu->pu_minor = getminor(p->p_sessp->s_dev);
    967 	pu->pu_ancpid = p->p_ancpid;
    968 	pu->pu_wstat = wstat;
    969 	/*
    970 	 * Compute average RSS in K.  The denominator is the number of
    971 	 * samples:  the number of clock ticks plus the initial value.
    972 	 */
    973 	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
    974 	    (PAGESIZE / 1024);
    975 	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
    976 
    977 	mutex_enter(&p->p_crlock);
    978 	pu->pu_ruid = crgetruid(p->p_cred);
    979 	pu->pu_rgid = crgetrgid(p->p_cred);
    980 	mutex_exit(&p->p_crlock);
    981 
    982 	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
    983 	bcopy(p->p_zone->zone_name, pu->pu_zonename,
    984 	    strlen(p->p_zone->zone_name) + 1);
    985 	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
    986 	    strlen(p->p_zone->zone_nodename) + 1);
    987 
    988 	/*
    989 	 * Calculate microstate accounting data for a process that is still
    990 	 * running.  Presently, we explicitly collect all of the LWP usage into
    991 	 * the proc usage structure here.
    992 	 */
    993 	if (flag & EW_PARTIAL)
    994 		exacct_calculate_proc_mstate(p, pu);
    995 	if (flag & EW_FINAL)
    996 		exacct_copy_proc_mstate(p, pu);
    997 }
    998 
    999 /*
   1000  * int exacct_assemble_proc_usage(proc_usage_t *, int (*)(void *, size_t, void
   1001  *	*, size_t, size_t *), void *, size_t, size_t *)
   1002  *
   1003  * Overview
   1004  *   Assemble record with miscellaneous accounting information about the process
   1005  *   and execute the callback on it. It is the callback's job to set "actual" to
   1006  *   the size of record.
   1007  *
   1008  * Return values
   1009  *   The result of the callback function, unless the extended process accounting
   1010  *   feature is not active, in which case ENOTACTIVE is returned.
   1011  *
   1012  * Caller's context
   1013  *   Suitable for KM_SLEEP allocations.
   1014  */
   1015 int
   1016 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
   1017     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
   1018     void *ubuf, size_t ubufsize, size_t *actual, int flag)
   1019 {
   1020 	ulong_t mask[AC_MASK_SZ];
   1021 	ea_object_t *proc_record;
   1022 	ea_catalog_t record_type;
   1023 	void *buf;
   1024 	size_t bufsize;
   1025 	int ret;
   1026 
   1027 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
   1028 
   1029 	mutex_enter(&ac_proc->ac_lock);
   1030 	if (ac_proc->ac_state == AC_OFF) {
   1031 		mutex_exit(&ac_proc->ac_lock);
   1032 		return (ENOTACTIVE);
   1033 	}
   1034 	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
   1035 	mutex_exit(&ac_proc->ac_lock);
   1036 
   1037 	switch (flag) {
   1038 	case EW_FINAL:
   1039 		record_type = EXD_GROUP_PROC;
   1040 		break;
   1041 	case EW_PARTIAL:
   1042 		record_type = EXD_GROUP_PROC_PARTIAL;
   1043 		break;
   1044 	}
   1045 
   1046 	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
   1047 	if (proc_record == NULL)
   1048 		return (0);
   1049 
   1050 	/*
   1051 	 * Pack object into buffer and pass to callback.
   1052 	 */
   1053 	bufsize = ea_pack_object(proc_record, NULL, 0);
   1054 	buf = kmem_alloc(bufsize, KM_SLEEP);
   1055 	(void) ea_pack_object(proc_record, buf, bufsize);
   1056 
   1057 	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
   1058 
   1059 	/*
   1060 	 * Free all previously allocations.
   1061 	 */
   1062 	kmem_free(buf, bufsize);
   1063 	ea_free_object(proc_record, EUP_ALLOC);
   1064 	return (ret);
   1065 }
   1066 
   1067 /*
   1068  * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
   1069  * 	size_t *)
   1070  *
   1071  * Overview
   1072  *   exacct_commit_callback() writes the indicated buffer to the indicated
   1073  *   extended accounting file.
   1074  *
   1075  * Return values
   1076  *   The result of the write operation is returned.  "actual" is updated to
   1077  *   contain the number of bytes actually written.
   1078  *
   1079  * Caller's context
   1080  *   Suitable for a vn_rdwr() operation.
   1081  */
   1082 /*ARGSUSED*/
   1083 int
   1084 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
   1085     void *buf, size_t bufsize, size_t *actual)
   1086 {
   1087 	int error = 0;
   1088 
   1089 	*actual = 0;
   1090 	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
   1091 		*actual = bufsize;
   1092 	return (error);
   1093 }
   1094 
   1095 static void
   1096 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
   1097 {
   1098 	size_t size;
   1099 	proc_usage_t *pu;
   1100 	ulong_t mask[AC_MASK_SZ];
   1101 
   1102 	mutex_enter(&ac_proc->ac_lock);
   1103 	if (ac_proc->ac_state == AC_ON) {
   1104 		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
   1105 		mutex_exit(&ac_proc->ac_lock);
   1106 	} else {
   1107 		mutex_exit(&ac_proc->ac_lock);
   1108 		return;
   1109 	}
   1110 
   1111 	mutex_enter(&p->p_lock);
   1112 	size = strlen(p->p_user.u_comm) + 1;
   1113 	mutex_exit(&p->p_lock);
   1114 
   1115 	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
   1116 	pu->pu_command = kmem_alloc(size, KM_SLEEP);
   1117 	mutex_enter(&p->p_lock);
   1118 	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
   1119 	mutex_exit(&p->p_lock);
   1120 
   1121 	(void) exacct_assemble_proc_usage(ac_proc, pu,
   1122 	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
   1123 
   1124 	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
   1125 	kmem_free(pu, sizeof (proc_usage_t));
   1126 }
   1127 
   1128 /*
   1129  * void exacct_commit_proc(proc_t *, int)
   1130  *
   1131  * Overview
   1132  *   exacct_commit_proc() calculates the final usage for a process, updating the
   1133  *   task usage if task accounting is active, and writing a process record if
   1134  *   process accounting is active.  exacct_commit_proc() is intended for being
   1135  *   called from proc_exit().
   1136  *
   1137  * Return values
   1138  *   None.
   1139  *
   1140  * Caller's context
   1141  *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
   1142  */
   1143 void
   1144 exacct_commit_proc(proc_t *p, int wstat)
   1145 {
   1146 	zone_t *zone = p->p_zone;
   1147 	struct exacct_globals *acg, *gacg = NULL;
   1148 
   1149 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
   1150 		/*
   1151 		 * acctctl module not loaded.  Nothing to do.
   1152 		 */
   1153 		return;
   1154 	}
   1155 	acg = zone_getspecific(exacct_zone_key, zone);
   1156 	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
   1157 	if (zone != global_zone) {
   1158 		gacg = zone_getspecific(exacct_zone_key, global_zone);
   1159 		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
   1160 	}
   1161 }
   1162 
   1163 static int
   1164 exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res)
   1165 {
   1166 	int		attached = 1;
   1167 
   1168 	switch (res) {
   1169 	case AC_NET_NAME:
   1170 		(void) ea_attach_item(record, ns->ns_name,
   1171 		    strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME);
   1172 		break;
   1173 	case AC_NET_CURTIME:
   1174 		{
   1175 			uint64_t	now;
   1176 			timestruc_t	ts;
   1177 
   1178 			gethrestime(&ts);
   1179 			now = (uint64_t)(ulong_t)ts.tv_sec;
   1180 			(void) ea_attach_item(record,  &now, sizeof (uint64_t),
   1181 			    EXT_UINT64 | EXD_NET_STATS_CURTIME);
   1182 		}
   1183 		break;
   1184 	case AC_NET_IBYTES:
   1185 		(void) ea_attach_item(record, &ns->ns_ibytes,
   1186 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES);
   1187 		break;
   1188 	case AC_NET_OBYTES:
   1189 		(void) ea_attach_item(record, &ns->ns_obytes,
   1190 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES);
   1191 		break;
   1192 	case AC_NET_IPKTS:
   1193 		(void) ea_attach_item(record, &ns->ns_ipackets,
   1194 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS);
   1195 		break;
   1196 	case AC_NET_OPKTS:
   1197 		(void) ea_attach_item(record, &ns->ns_opackets,
   1198 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS);
   1199 		break;
   1200 	case AC_NET_IERRPKTS:
   1201 		(void) ea_attach_item(record, &ns->ns_ierrors,
   1202 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS);
   1203 		break;
   1204 	case AC_NET_OERRPKTS:
   1205 		(void) ea_attach_item(record, &ns->ns_oerrors,
   1206 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS);
   1207 		break;
   1208 	default:
   1209 		attached = 0;
   1210 	}
   1211 	return (attached);
   1212 }
   1213 
   1214 static int
   1215 exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res)
   1216 {
   1217 	int attached = 1;
   1218 
   1219 	switch (res) {
   1220 	case AC_NET_NAME:
   1221 		(void) ea_attach_item(record, nd->nd_name,
   1222 		    strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME);
   1223 		break;
   1224 	case AC_NET_DEVNAME:
   1225 		(void) ea_attach_item(record, nd->nd_devname,
   1226 		    strlen(nd->nd_devname) + 1, EXT_STRING |
   1227 		    EXD_NET_DESC_DEVNAME);
   1228 		break;
   1229 	case AC_NET_EHOST:
   1230 		(void) ea_attach_item(record, &nd->nd_ehost,
   1231 		    sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST);
   1232 		break;
   1233 	case AC_NET_EDEST:
   1234 		(void) ea_attach_item(record, &nd->nd_edest,
   1235 		    sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST);
   1236 		break;
   1237 	case AC_NET_VLAN_TPID:
   1238 		(void) ea_attach_item(record, &nd->nd_vlan_tpid,
   1239 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID);
   1240 		break;
   1241 	case AC_NET_VLAN_TCI:
   1242 		(void) ea_attach_item(record, &nd->nd_vlan_tci,
   1243 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI);
   1244 		break;
   1245 	case AC_NET_SAP:
   1246 		(void) ea_attach_item(record, &nd->nd_sap,
   1247 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP);
   1248 		break;
   1249 	case AC_NET_PRIORITY:
   1250 		(void) ea_attach_item(record, &nd->nd_priority,
   1251 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY);
   1252 		break;
   1253 	case AC_NET_BWLIMIT:
   1254 		(void) ea_attach_item(record, &nd->nd_bw_limit,
   1255 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT);
   1256 		break;
   1257 	case AC_NET_SADDR:
   1258 		if (nd->nd_isv4) {
   1259 			(void) ea_attach_item(record, &nd->nd_saddr[3],
   1260 			    sizeof (uint32_t), EXT_UINT32 |
   1261 			    EXD_NET_DESC_V4SADDR);
   1262 		} else {
   1263 			(void) ea_attach_item(record, &nd->nd_saddr,
   1264 			    sizeof (nd->nd_saddr), EXT_RAW |
   1265 			    EXD_NET_DESC_V6SADDR);
   1266 		}
   1267 		break;
   1268 	case AC_NET_DADDR:
   1269 		if (nd->nd_isv4) {
   1270 			(void) ea_attach_item(record, &nd->nd_daddr[3],
   1271 			    sizeof (uint32_t), EXT_UINT32 |
   1272 			    EXD_NET_DESC_V4DADDR);
   1273 		} else {
   1274 			(void) ea_attach_item(record, &nd->nd_daddr,
   1275 			    sizeof (nd->nd_daddr), EXT_RAW |
   1276 			    EXD_NET_DESC_V6DADDR);
   1277 		}
   1278 		break;
   1279 	case AC_NET_SPORT:
   1280 		(void) ea_attach_item(record, &nd->nd_sport,
   1281 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT);
   1282 		break;
   1283 	case AC_NET_DPORT:
   1284 		(void) ea_attach_item(record, &nd->nd_dport,
   1285 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT);
   1286 		break;
   1287 	case AC_NET_PROTOCOL:
   1288 		(void) ea_attach_item(record, &nd->nd_protocol,
   1289 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL);
   1290 		break;
   1291 	case AC_NET_DSFIELD:
   1292 		(void) ea_attach_item(record, &nd->nd_dsfield,
   1293 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD);
   1294 		break;
   1295 	default:
   1296 		attached = 0;
   1297 	}
   1298 	return (attached);
   1299 }
   1300 
   1301 static ea_object_t *
   1302 exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type,
   1303     int what)
   1304 {
   1305 	int		res;
   1306 	int		count;
   1307 	ea_object_t	*record;
   1308 
   1309 	/*
   1310 	 * Assemble usage values into group.
   1311 	 */
   1312 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
   1313 	for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++)
   1314 		if (BT_TEST(mask, res)) {
   1315 			if (what == EX_NET_LNDESC_REC ||
   1316 			    what == EX_NET_FLDESC_REC) {
   1317 				count += exacct_attach_netdesc_item(
   1318 				    (net_desc_t *)ninfo, record, res);
   1319 			} else {
   1320 				count += exacct_attach_netstat_item(
   1321 				    (net_stat_t *)ninfo, record, res);
   1322 			}
   1323 		}
   1324 	if (count == 0) {
   1325 		ea_free_object(record, EUP_ALLOC);
   1326 		record = NULL;
   1327 	}
   1328 	return (record);
   1329 }
   1330 
   1331 int
   1332 exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo,
   1333     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
   1334     void *ubuf, size_t ubufsize, size_t *actual, int what)
   1335 {
   1336 	ulong_t		mask[AC_MASK_SZ];
   1337 	ea_object_t	*net_desc;
   1338 	ea_catalog_t	record_type;
   1339 	void		*buf;
   1340 	size_t		bufsize;
   1341 	int		ret;
   1342 
   1343 	mutex_enter(&ac_net->ac_lock);
   1344 	if (ac_net->ac_state == AC_OFF) {
   1345 		mutex_exit(&ac_net->ac_lock);
   1346 		return (ENOTACTIVE);
   1347 	}
   1348 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
   1349 	mutex_exit(&ac_net->ac_lock);
   1350 
   1351 	switch (what) {
   1352 	case EX_NET_LNDESC_REC:
   1353 		record_type = EXD_GROUP_NET_LINK_DESC;
   1354 		break;
   1355 	case EX_NET_LNSTAT_REC:
   1356 		record_type = EXD_GROUP_NET_LINK_STATS;
   1357 		break;
   1358 	case EX_NET_FLDESC_REC:
   1359 		record_type = EXD_GROUP_NET_FLOW_DESC;
   1360 		break;
   1361 	case EX_NET_FLSTAT_REC:
   1362 		record_type = EXD_GROUP_NET_FLOW_STATS;
   1363 		break;
   1364 	}
   1365 
   1366 	net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what);
   1367 	if (net_desc == NULL)
   1368 		return (0);
   1369 
   1370 	/*
   1371 	 * Pack object into buffer and pass to callback.
   1372 	 */
   1373 	bufsize = ea_pack_object(net_desc, NULL, 0);
   1374 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
   1375 	if (buf == NULL)
   1376 		return (ENOMEM);
   1377 
   1378 	(void) ea_pack_object(net_desc, buf, bufsize);
   1379 
   1380 	ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual);
   1381 
   1382 	/*
   1383 	 * Free all previously allocations.
   1384 	 */
   1385 	kmem_free(buf, bufsize);
   1386 	ea_free_object(net_desc, EUP_ALLOC);
   1387 	return (ret);
   1388 }
   1389 
   1390 int
   1391 exacct_commit_netinfo(void *arg, int what)
   1392 {
   1393 	size_t			size;
   1394 	ulong_t			mask[AC_MASK_SZ];
   1395 	struct exacct_globals	*acg;
   1396 	ac_info_t		*ac_net;
   1397 
   1398 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
   1399 		/*
   1400 		 * acctctl module not loaded. Nothing to do.
   1401 		 */
   1402 		return (ENOTACTIVE);
   1403 	}
   1404 
   1405 	/*
   1406 	 * Even though each zone nominally has its own flow accounting settings
   1407 	 * (ac_flow), these are only maintained by and for the global zone.
   1408 	 *
   1409 	 * If this were to change in the future, this function should grow a
   1410 	 * second zoneid (or zone) argument, and use the corresponding zone's
   1411 	 * settings rather than always using those of the global zone.
   1412 	 */
   1413 	acg = zone_getspecific(exacct_zone_key, global_zone);
   1414 	ac_net = &acg->ac_net;
   1415 
   1416 	mutex_enter(&ac_net->ac_lock);
   1417 	if (ac_net->ac_state == AC_OFF) {
   1418 		mutex_exit(&ac_net->ac_lock);
   1419 		return (ENOTACTIVE);
   1420 	}
   1421 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
   1422 	mutex_exit(&ac_net->ac_lock);
   1423 
   1424 	return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback,
   1425 	    NULL, 0, &size, what));
   1426 }
   1427 
   1428 static int
   1429 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
   1430 {
   1431 	int attached = 1;
   1432 
   1433 	switch (res) {
   1434 	case AC_FLOW_SADDR:
   1435 		if (fu->fu_isv4) {
   1436 			(void) ea_attach_item(record, &fu->fu_saddr[3],
   1437 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
   1438 		} else {
   1439 			(void) ea_attach_item(record, &fu->fu_saddr,
   1440 			    sizeof (fu->fu_saddr), EXT_RAW |
   1441 			    EXD_FLOW_V6SADDR);
   1442 		}
   1443 		break;
   1444 	case AC_FLOW_DADDR:
   1445 		if (fu->fu_isv4) {
   1446 			(void) ea_attach_item(record, &fu->fu_daddr[3],
   1447 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
   1448 		} else {
   1449 			(void) ea_attach_item(record, &fu->fu_daddr,
   1450 			    sizeof (fu->fu_daddr), EXT_RAW |
   1451 			    EXD_FLOW_V6DADDR);
   1452 		}
   1453 		break;
   1454 	case AC_FLOW_SPORT:
   1455 		(void) ea_attach_item(record, &fu->fu_sport,
   1456 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
   1457 		break;
   1458 	case AC_FLOW_DPORT:
   1459 		(void) ea_attach_item(record, &fu->fu_dport,
   1460 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
   1461 		break;
   1462 	case AC_FLOW_PROTOCOL:
   1463 		(void) ea_attach_item(record, &fu->fu_protocol,
   1464 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
   1465 		break;
   1466 	case AC_FLOW_DSFIELD:
   1467 		(void) ea_attach_item(record, &fu->fu_dsfield,
   1468 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
   1469 		break;
   1470 	case AC_FLOW_CTIME:
   1471 		(void) ea_attach_item(record, &fu->fu_ctime,
   1472 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
   1473 		break;
   1474 	case AC_FLOW_LSEEN:
   1475 		(void) ea_attach_item(record, &fu->fu_lseen,
   1476 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
   1477 		break;
   1478 	case AC_FLOW_NBYTES:
   1479 		(void) ea_attach_item(record, &fu->fu_nbytes,
   1480 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
   1481 		break;
   1482 	case AC_FLOW_NPKTS:
   1483 		(void) ea_attach_item(record, &fu->fu_npackets,
   1484 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
   1485 		break;
   1486 	case AC_FLOW_PROJID:
   1487 		if (fu->fu_projid >= 0) {
   1488 			(void) ea_attach_item(record, &fu->fu_projid,
   1489 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
   1490 		}
   1491 		break;
   1492 	case AC_FLOW_UID:
   1493 		if (fu->fu_userid >= 0) {
   1494 			(void) ea_attach_item(record, &fu->fu_userid,
   1495 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
   1496 		}
   1497 		break;
   1498 	case AC_FLOW_ANAME:
   1499 		(void) ea_attach_item(record, fu->fu_aname,
   1500 		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
   1501 		break;
   1502 	default:
   1503 		attached = 0;
   1504 	}
   1505 	return (attached);
   1506 }
   1507 
   1508 static ea_object_t *
   1509 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
   1510     ea_catalog_t record_type)
   1511 {
   1512 	int res, count;
   1513 	ea_object_t *record;
   1514 
   1515 	/*
   1516 	 * Assemble usage values into group.
   1517 	 */
   1518 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
   1519 	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
   1520 		if (BT_TEST(mask, res))
   1521 			count += exacct_attach_flow_item(fu, record, res);
   1522 	if (count == 0) {
   1523 		ea_free_object(record, EUP_ALLOC);
   1524 		record = NULL;
   1525 	}
   1526 	return (record);
   1527 }
   1528 
   1529 int
   1530 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
   1531     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
   1532     void *ubuf, size_t ubufsize, size_t *actual)
   1533 {
   1534 	ulong_t mask[AC_MASK_SZ];
   1535 	ea_object_t *flow_usage;
   1536 	ea_catalog_t record_type;
   1537 	void *buf;
   1538 	size_t bufsize;
   1539 	int ret;
   1540 
   1541 	mutex_enter(&ac_flow->ac_lock);
   1542 	if (ac_flow->ac_state == AC_OFF) {
   1543 		mutex_exit(&ac_flow->ac_lock);
   1544 		return (ENOTACTIVE);
   1545 	}
   1546 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
   1547 	mutex_exit(&ac_flow->ac_lock);
   1548 
   1549 	record_type = EXD_GROUP_FLOW;
   1550 
   1551 	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
   1552 	if (flow_usage == NULL) {
   1553 		return (0);
   1554 	}
   1555 
   1556 	/*
   1557 	 * Pack object into buffer and pass to callback.
   1558 	 */
   1559 	bufsize = ea_pack_object(flow_usage, NULL, 0);
   1560 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
   1561 	if (buf == NULL) {
   1562 		return (ENOMEM);
   1563 	}
   1564 
   1565 	(void) ea_pack_object(flow_usage, buf, bufsize);
   1566 
   1567 	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
   1568 
   1569 	/*
   1570 	 * Free all previously allocations.
   1571 	 */
   1572 	kmem_free(buf, bufsize);
   1573 	ea_free_object(flow_usage, EUP_ALLOC);
   1574 	return (ret);
   1575 }
   1576 
   1577 void
   1578 exacct_commit_flow(void *arg)
   1579 {
   1580 	flow_usage_t *f = (flow_usage_t *)arg;
   1581 	size_t size;
   1582 	ulong_t mask[AC_MASK_SZ];
   1583 	struct exacct_globals *acg;
   1584 	ac_info_t *ac_flow;
   1585 
   1586 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
   1587 		/*
   1588 		 * acctctl module not loaded. Nothing to do.
   1589 		 */
   1590 		return;
   1591 	}
   1592 
   1593 	/*
   1594 	 * Even though each zone nominally has its own flow accounting settings
   1595 	 * (ac_flow), these are only maintained by and for the global zone.
   1596 	 *
   1597 	 * If this were to change in the future, this function should grow a
   1598 	 * second zoneid (or zone) argument, and use the corresponding zone's
   1599 	 * settings rather than always using those of the global zone.
   1600 	 */
   1601 	acg = zone_getspecific(exacct_zone_key, global_zone);
   1602 	ac_flow = &acg->ac_flow;
   1603 
   1604 	mutex_enter(&ac_flow->ac_lock);
   1605 	if (ac_flow->ac_state == AC_OFF) {
   1606 		mutex_exit(&ac_flow->ac_lock);
   1607 		return;
   1608 	}
   1609 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
   1610 	mutex_exit(&ac_flow->ac_lock);
   1611 
   1612 	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
   1613 	    NULL, 0, &size);
   1614 }
   1615 
   1616 /*
   1617  * int exacct_tag_task(task_t *, void *, size_t, int)
   1618  *
   1619  * Overview
   1620  *   exacct_tag_task() provides the exacct record construction and writing
   1621  *   support required by putacct(2) for task entities.
   1622  *
   1623  * Return values
   1624  *   The result of the write operation is returned, unless the extended
   1625  *   accounting facility is not active, in which case ENOTACTIVE is returned.
   1626  *
   1627  * Caller's context
   1628  *   Suitable for KM_SLEEP allocations.
   1629  */
   1630 int
   1631 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
   1632     int flags)
   1633 {
   1634 	int error = 0;
   1635 	void *buf;
   1636 	size_t bufsize;
   1637 	ea_catalog_t cat;
   1638 	ea_object_t *tag;
   1639 
   1640 	mutex_enter(&ac_task->ac_lock);
   1641 	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
   1642 		mutex_exit(&ac_task->ac_lock);
   1643 		return (ENOTACTIVE);
   1644 	}
   1645 	mutex_exit(&ac_task->ac_lock);
   1646 
   1647 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
   1648 	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
   1649 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
   1650 	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
   1651 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
   1652 	if (flags == EP_RAW)
   1653 		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
   1654 	else
   1655 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
   1656 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
   1657 
   1658 	bufsize = ea_pack_object(tag, NULL, 0);
   1659 	buf = kmem_alloc(bufsize, KM_SLEEP);
   1660 	(void) ea_pack_object(tag, buf, bufsize);
   1661 	error = exacct_vn_write(ac_task, buf, bufsize);
   1662 	kmem_free(buf, bufsize);
   1663 	ea_free_object(tag, EUP_ALLOC);
   1664 	return (error);
   1665 }
   1666 
   1667 /*
   1668  * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *)
   1669  *
   1670  * Overview
   1671  *   exacct_tag_proc() provides the exacct record construction and writing
   1672  *   support required by putacct(2) for processes.
   1673  *
   1674  * Return values
   1675  *   The result of the write operation is returned, unless the extended
   1676  *   accounting facility is not active, in which case ENOTACTIVE is returned.
   1677  *
   1678  * Caller's context
   1679  *   Suitable for KM_SLEEP allocations.
   1680  */
   1681 int
   1682 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
   1683     size_t ubufsz, int flags, const char *hostname)
   1684 {
   1685 	int error = 0;
   1686 	void *buf;
   1687 	size_t bufsize;
   1688 	ea_catalog_t cat;
   1689 	ea_object_t *tag;
   1690 
   1691 	mutex_enter(&ac_proc->ac_lock);
   1692 	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
   1693 		mutex_exit(&ac_proc->ac_lock);
   1694 		return (ENOTACTIVE);
   1695 	}
   1696 	mutex_exit(&ac_proc->ac_lock);
   1697 
   1698 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
   1699 	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
   1700 	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
   1701 	(void) ea_attach_item(tag, &tkid, 0,
   1702 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
   1703 	(void) ea_attach_item(tag, (void *)hostname, 0,
   1704 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
   1705 	if (flags == EP_RAW)
   1706 		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
   1707 	else
   1708 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
   1709 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
   1710 
   1711 	bufsize = ea_pack_object(tag, NULL, 0);
   1712 	buf = kmem_alloc(bufsize, KM_SLEEP);
   1713 	(void) ea_pack_object(tag, buf, bufsize);
   1714 	error = exacct_vn_write(ac_proc, buf, bufsize);
   1715 	kmem_free(buf, bufsize);
   1716 	ea_free_object(tag, EUP_ALLOC);
   1717 	return (error);
   1718 }
   1719 
   1720 /*
   1721  * void exacct_init(void)
   1722  *
   1723  * Overview
   1724  *   Initialized the extended accounting subsystem.
   1725  *
   1726  * Return values
   1727  *   None.
   1728  *
   1729  * Caller's context
   1730  *   Suitable for KM_SLEEP allocations.
   1731  */
   1732 void
   1733 exacct_init()
   1734 {
   1735 	exacct_queue = system_taskq;
   1736 	exacct_object_cache = kmem_cache_create("exacct_object_cache",
   1737 	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
   1738 }
   1739 
   1740 /*
   1741  * exacct_snapshot_proc_mstate() copies a process's microstate accounting data
   1742  * and resource usage counters into a given task_usage_t. It differs from
   1743  * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t,
   1744  * b) p_lock will have been acquired earlier in the call path and c) we
   1745  * are here including the process's user and system times.
   1746  */
   1747 static void
   1748 exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu)
   1749 {
   1750 	tu->tu_utime  = mstate_aggr_state(p, LMS_USER);
   1751 	tu->tu_stime  = mstate_aggr_state(p, LMS_SYSTEM);
   1752 	tu->tu_minflt = p->p_ru.minflt;
   1753 	tu->tu_majflt = p->p_ru.majflt;
   1754 	tu->tu_sndmsg = p->p_ru.msgsnd;
   1755 	tu->tu_rcvmsg = p->p_ru.msgrcv;
   1756 	tu->tu_ioch   = p->p_ru.ioch;
   1757 	tu->tu_iblk   = p->p_ru.inblock;
   1758 	tu->tu_oblk   = p->p_ru.oublock;
   1759 	tu->tu_vcsw   = p->p_ru.nvcsw;
   1760 	tu->tu_icsw   = p->p_ru.nivcsw;
   1761 	tu->tu_nsig   = p->p_ru.nsignals;
   1762 	tu->tu_nswp   = p->p_ru.nswap;
   1763 	tu->tu_nscl   = p->p_ru.sysc;
   1764 }
   1765 
   1766 /*
   1767  * void exacct_move_mstate(proc_t *, task_t *, task_t *)
   1768  *
   1769  * Overview
   1770  *   exacct_move_mstate() is called by task_change() and accounts for
   1771  *   a process's resource usage when it is moved from one task to another.
   1772  *
   1773  *   The process's usage at this point is recorded in the new task so
   1774  *   that it can be excluded from the calculation of resources consumed
   1775  *   by that task.
   1776  *
   1777  *   The resource usage inherited by the new task is also added to the
   1778  *   aggregate maintained by the old task for processes that have exited.
   1779  *
   1780  * Return values
   1781  *   None.
   1782  *
   1783  * Caller's context
   1784  *   pidlock and p_lock held across exacct_move_mstate().
   1785  */
   1786 void
   1787 exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk)
   1788 {
   1789 	task_usage_t tu;
   1790 
   1791 	/* Take a snapshot of this process's mstate and RU counters */
   1792 	exacct_snapshot_proc_mstate(p, &tu);
   1793 
   1794 	/*
   1795 	 * Use the snapshot to increment the aggregate usage of the old
   1796 	 * task, and the inherited usage of the new one.
   1797 	 */
   1798 	mutex_enter(&oldtk->tk_usage_lock);
   1799 	exacct_add_task_mstate(oldtk->tk_usage, &tu);
   1800 	mutex_exit(&oldtk->tk_usage_lock);
   1801 	mutex_enter(&newtk->tk_usage_lock);
   1802 	exacct_add_task_mstate(newtk->tk_inherited, &tu);
   1803 	mutex_exit(&newtk->tk_usage_lock);
   1804 }
   1805