Home | History | Annotate | Download | only in diskomizer
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 #pragma ident	"@(#)diskomizer64mpism.c	2.91	09/07/16 SMI"
     23 
     24 /*
     25  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     26  * Use is subject to license terms.
     27  */
     28 
     29 /*
     30  *	diskomizer64mpism
     31  *
     32  *	Write to and then read from disk partitions and or files.
     33  *
     34  *	This is a test program.
     35  *
     36  *	To do:
     37  *
     38  *		The messages it prints out at the beginning are a mess.
     39  *		The code should be better commented.
     40  *
     41  *	Chris.Gerhard (at) uk.sun.com - SMCC CTE
     42  */
     43 #include "args.h"
     44 #include "diskomizer64mpism.h"
     45 #include "bufs.h"
     46 #include "buf_init.h"
     47 #include <netdb.h>
     48 #include <sys/systeminfo.h>
     49 #include <tnf/probe.h>
     50 #include <sys/times.h>
     51 #include <diskomizer/log.h>
     52 #include "findap.h"
     53 #include "device_control.h"
     54 #include "timeval.h"
     55 #include "list_ops.h"
     56 #include "bits.h"
     57 #include "locks.h"
     58 #include "shm_ops.h"
     59 #include "signal_catch.h"
     60 #include "limit.h"
     61 #include "time.h"
     62 #include "prompt.h"
     63 #include "errors.h"
     64 #include "utils.h"
     65 #include "shared_device_info.h"
     66 #include "decode_errors.h"
     67 #include <sys/utsname.h>
     68 #include <sys/statvfs.h>
     69 #include <dlfcn.h>
     70 #include <diskomizer/uadmin.h>
     71 #include <usage_tracking/usage_tracking.h>
     72 #include "disko_usage_track.h"
     73 #ifdef __i386
     74 #include <note.h>
     75 #endif
     76 
     77 #define	OPEN_BRACE '{'
     78 #define	CLOSE_BRACE '}'
     79 #define	DEFAULT_BLOCK_SIZE 0x200 /* 512 */
     80 
     81 static char diskomizer_str[] = "diskomizer";
     82 iolen_index_t max_disk_io_len;
     83 static char *diffs;
     84 static char diffs_str[] = "diffs";
     85 
     86 static char write_str[] = "write";
     87 static char read_str[] = "read";
     88 static const char nil[] = "nil";
     89 #define	NIL(A) (A == NULL ? nil : A)
     90 static char *random_str;
     91 static hrtime_t stoptime;
     92 static time_t (*secs_till_exit)(void);
     93 static struct timeval start_time;
     94 
     95 void *usage_tracking_handle;
     96 
/*
 * The kind of read being requested.  A value of this type is passed as
 * the read_type argument of do_new_read().
 */
enum read_type {
	NORMAL_READ,
	RETRY_READ,
	WRITE_READ,
	READ_ONLY_RAND,
	READ_ONLY_SEQ
};
typedef enum read_type read_type_t;
    105 
    106 typedef uchar_t (*initializer_t)(int buf, int i);
    107 
    108 static int exit_status = EXIT_SUCCESS;
    109 
    110 /*
    111  * The minimum block size that can be used. Essentially the lowest common
    112  * multiple of the blocksizes available.
    113  */
    114 static int min_block_size;
    115 
    116 struct proc_store {
    117 	pid_t pid;
    118 };
    119 static struct proc_store *proc_store;
    120 /*
    121  * The daio_ops
    122  */
    123 static struct daio_ops *daio;
    124 /*
    125  * All the functions we have
    126  */
    127 time_t handle_read(struct aio_str *aiop, ullong_t start);
    128 time_t do_new_read(struct aio_str *aiop, ullong_t start, read_type_t read_type);
    129 time_t handle_readonly_rand(struct aio_str *aiop, ullong_t start);
    130 
    131 static int proc_no;
    132 long long convert_time(struct timeval tv);
    133 void update_time_stats(char off, struct times *tp, long long tyme,
    134 	struct aio_str *aiop);
    135 static int pend_write_with_lock(bitmap_t map[], ullong_t off, int maplen);
    136 static int do_memcmp(ullong_t start, struct aio_str *aiop);
    137 static struct device *open_device(char *name, struct paths *, ullong_t size,
    138 	int paths_to_use, int error_paths);
    139 static void check_matching_io(ullong_t start, struct aio_str *aiop);
    140 static int is_master(void);
    141 static void unwritten_block_rand(bitmap_t *map, struct aio_str *aiop,
    142 	ullong_t start, ullong_t len, int maplen);
    143 static void unwritten_block_seq(bitmap_t *map, struct aio_str *aiop,
    144 	ullong_t start, ullong_t len, int maplen);
    145 extern void close_and_free_paths(struct device *dev);
    146 extern void run_func(uchar_t *buf, size_t size);
    147 static struct bufhdr
    148 	build_bufhr(struct device *dev, ullong_t start, ullong_t off);
    149 void newfd(struct aio_str *aiop);
    150 struct fds *open_path(struct device *devp, char *name, ullong_t size);
    151 void cancel_all_io_byfd(struct fds *fd);
    152 struct fds *find_path(struct fds *fdhead, char path_id);
    153 static int check_for_duplicate_paths(struct device *devp);
    154 static void do_start_cancelled_io(struct device *devices, ullong_t start);
    155 static int return_zero(void);
    156 static int return_one(void);
    157 /*
    158  * The error handling functions.
    159  */
    160 static loop_type on_error_reread(ullong_t start, struct aio_str *aiop);
    161 static loop_type on_error_exit(ullong_t start, struct aio_str *aiop);
    162 static loop_type on_error_stop(ullong_t start, struct aio_str *aiop);
    163 static loop_type on_error_nop(ullong_t start, struct aio_str *aiop);
    164 static loop_type on_error_abort(ullong_t start, struct aio_str *aiop);
    165 static loop_type on_error_pause(ullong_t start, struct aio_str *aiop);
    166 static loop_type on_error_retry(ullong_t start, struct aio_str *aiop);
    167 static loop_type on_error_rewrite(ullong_t start, struct aio_str *aiop);
    168 static loop_type on_error_fail_path(ullong_t start, struct aio_str *aiop);
    169 /*
    170  * Error handling init functions.
    171  */
    172 static int init_path_stop_check(void);
    173 static int init_stop_check(void);
    174 /*
    175  * the "globals" that we use
    176  */
    177 struct device *devices; /* all the devices there are */
    178 
    179 write_buf_initializer_t init_uchar_func;
    180 static read_buf_initializer_t read_buffer_initializer;
    181 
    182 pid_t pgrp;
    183 /*
    184  * statics
    185  */
    186 static pid_t parent_pid;
    187 static pid_t killer_pid;
    188 static on_error_t *on_error_corrupt;
    189 static on_error_t *on_error_short;
    190 static on_error_t *on_write_error;
    191 static int Longest_device_name = 0;
    192 static int Longest_logical_name = 0;
    193 static int write_loops;
    194 
    195 static int usr1_exit = 0;
    196 
    197 struct shm_ops *shm_ops;
    198 /*
    199  * Count of the total number of io's that are currently cancelled.
    200  */
    201 static int cancelled_count = 0;
/*
 * Start cancelled.  Only gets unset from nop if there are cancelled ios
 * to restart.  A rare thing.
 */
static void (*start_cancelled_io)(struct device *, ullong_t) =
	(void (*)(struct device *, ullong_t)) nop;
/*
 * Like start_cancelled_io this hook initially points at nop.
 * NOTE(review): presumably it is repointed when there is deferred i/o
 * to start - confirm against the code that assigns it.
 */
static void (*start_deferred)(struct device *dev, ullong_t) =
	(void (*)(struct device *, ullong_t)) nop;

/*
 * Stop-check hooks.  Both initially point at return_zero, so until
 * something else is installed they always return 0.
 */
static int (*stop_check)(void *handle) = (int (*)(void *))return_zero;
static int (*path_stop_check)(struct fds *fd, struct device *dev) =
	(int (*)(struct fds *, struct device *dev))return_zero;
    214 static char nom[] = "no memory";
    215 #define	NOT_NULL(A) (A == NULL ? &nom[0] : A)
    216 #define	PLURAL(A) (A == 1 ? "" : "s")
    217 #define	LEN_BYTES2BLOCKS(A) (A->length / INDEX_TO_DIOLEN(max_disk_io_len))
    218 #define	TRUE_OR_FALSE(A) (A ? "true" : "false")
    219 
/*
 * One entry of on_error_table: a named error handler together with its
 * init routine and the flags that describe how it may be used.
 */
struct error_handlers {
	char *name; /* String that describes this error handler */
	on_error_t func; /* error handling function */
	int (*setup)(void); /* init routine for the error handler */
	uint_t breaker:1; /* Is this the last error handler on the list */
	uint_t rw:2; /* Does this handler apply to read or write or both */
};
    227 
/*
 * Bit values for the rw field of struct error_handlers.
 */
#define	READ_ERR 1
#define	WRITE_ERR (READ_ERR << 1)
#define	BOTH_ERR (READ_ERR | WRITE_ERR)

/*
 * The table of known error handlers.  Each initializer is in
 * struct error_handlers field order: name, func, setup, breaker, rw.
 */
struct error_handlers on_error_table[] = {
	{"EXIT", on_error_exit, return_one, 1, BOTH_ERR},
	{"ABORT", on_error_abort, return_one, 1, BOTH_ERR},
	{"CONTINUE", on_error_nop, return_one, 0, BOTH_ERR},
	{"NONE", on_error_nop, return_one, 0, BOTH_ERR},
	{"STOP", on_error_stop, init_stop_check, 1, BOTH_ERR},
	{"PAUSE", on_error_pause, return_one, 0, BOTH_ERR},
	{"RETRY", on_error_retry, return_one, 0, BOTH_ERR},
	{"FAIL_PATH", on_error_fail_path, init_path_stop_check, 0, BOTH_ERR},
	{"UADMIN", on_error_uadmin, uadmin_init, 1, BOTH_ERR},
	{"REREAD", on_error_reread, return_one, 0, READ_ERR},
	{"REWRITE", on_error_rewrite, return_one, 0, WRITE_ERR}
};
    245 
    246 /*
    247  * TNF declarations.
    248  */
    249 /*
    250  * The DEFINE should not have explicit mentions of the daio_ZZZZ
    251  * elements, they should be opaque.
    252  */
    253 TNF_DEFINE_RECORD_5(aio_str_t, aio_tnf_str,
    254 	tnf_opaque, buf, tnf_short, iolen, tnf_ulonglong, off,
    255 	tnf_longlong, aio_res.result.daio_return,
    256 	tnf_uint, aio_res.result.daio_errno)
    257 /*
    258  * locking functions.
    259  */
    260 static char *
    261 hostname(void)
    262 {
    263 	static char hostname[MAXHOSTNAMELEN + 1];
    264 	(void) sysinfo(SI_HOSTNAME, &hostname[0], MAXHOSTNAMELEN);
    265 	return (&hostname[0]);
    266 }
    267 int
    268 this_proc(void)
    269 {
    270 	return (proc_no);
    271 }
    272 static int
    273 return_one(void)
    274 {
    275 	return (1);
    276 }
    277 static int
    278 return_zero(void)
    279 {
    280 	return (0);
    281 }
    282 
    283 void
    284 nop(void)
    285 {
    286 }
    287 
    288 static void
    289 not_null_free(void *ptr)
    290 {
    291 	if (ptr != NULL)
    292 		free(ptr);
    293 }
    294 static time_t
    295 inf_secs_till_exit(void)
    296 {
    297 	return (LONG_MAX);
    298 }
    299 static time_t
    300 do_secs_till_exit(void)
    301 {
    302 	return ((stoptime - gethrtime()) / BILLION);
    303 }
    304 
    305 off64_t
    306 start_offset(void)
    307 {
    308 	return ((off64_t)(opts.start_offset *
    309 	    (ullong_t)INDEX_TO_DIOLEN(max_disk_io_len)));
    310 }
    311 
    312 static void
    313 return_aio_read_buf(struct aio_str *aiop)
    314 {
    315 	if (!(aiop->count % opts.expert_release_read_buffers_after_n_uses) &&
    316 	    aiop->buf != NULL) {
    317 		return_read_buf(aiop->buf);
    318 		aiop->buf = NULL;
    319 	}
    320 }
    321 
    322 static int
    323 do_stop_check(void *handle)
    324 {
    325 	return (get_shared_stop_flag(handle, this_proc()));
    326 }
    327 int
    328 is_readonly(void)
    329 {
    330 	return (opts.o_rdonly == 1);
    331 }
    332 const char *
    333 rw_string(void)
    334 {
    335 	return (is_readonly() ? read_str : write_str);
    336 }
    337 /*
    338  * background.  disassociate from controlling tty make session leader
    339  * 		then fork.  The parent exits and the child goes on in
    340  *		the back ground.
    341  */
    342 static void
    343 background()
    344 {
    345 	pid_t pid;
    346 	(void) freopen("/dev/null", "+r", stdin);
    347 
    348 	pid = opts.use_fork1 == 0 ? fork() : fork1();
    349 
    350 	if (pid == 0) {
    351 		if (setsid() == (pid_t)-1)
    352 			pperror("setsid");
    353 		return;
    354 	}
    355 	if (pid < 0) {
    356 		FORK_ERROR(opts.use_fork1 == 0 ? "" : "s");
    357 		exit(1);
    358 	}
    359 	exit(0);
    360 }
    361 static struct blks *
    362 aio_attach(struct aio_str *aiop)
    363 {
    364 	int error_count = 0;
    365 	struct blks *blocks;
    366 	while ((blocks = shm_ops->attach(AIO_BLOCK_HANDLE(aiop))) ==
    367 	    NULL) {
    368 		if (error_count++ % 10000 == 0)
    369 			ATTACH_ERROR(AIO_BLOCK_HANDLE(aiop));
    370 	}
    371 	if (error_count > 0)
    372 		plog(LOG_WARNING, "attached o.k.\n");
    373 	return (blocks);
    374 }
    375 static void
    376 update_aio_time_stats(struct aio_str *aiop, struct times *ts)
    377 {
    378 	if (aiop->count > 0) {
    379 		ullong_t len = LEN_BYTES2BLOCKS(aiop->dev);
    380 
    381 		update_time_stats((100 * MIN(aiop->dev->block, len))/
    382 		    ((aiop->dev->length/
    383 		    INDEX_TO_DIOLEN(max_disk_io_len))),
    384 		    ts,
    385 		    DAIO_GET_TIME_TAKEN(aiop->aio_res), aiop);
    386 	}
    387 }
    388 
    389 static void
    390 update_aio_read_stats(struct aio_str *aiop)
    391 {
    392 	update_aio_time_stats(aiop, &aiop->fd->read_times);
    393 }
    394 
    395 static void
    396 update_aio_write_stats(struct aio_str *aiop)
    397 {
    398 	update_aio_time_stats(aiop, &aiop->fd->write_times);
    399 }
    400 
    401 ullong_t
    402 diskomizer_off2byteoff(ullong_t off)
    403 {
    404 	return ((off + opts.start_offset) * INDEX_TO_DIOLEN(max_disk_io_len));
    405 }
    406 
    407 static ullong_t
    408 byteoff2diskomizer_off(ullong_t off)
    409 {
    410 	return ((off/INDEX_TO_DIOLEN(max_disk_io_len)) - opts.start_offset);
    411 }
    412 /*
    413  * Sanity check.
    414  */
    415 #define	ASSERT_OFFSET(X) \
    416 	assert(byteoff2diskomizer_off(diskomizer_off2byteoff(X)) == X)
    417 
    418 static ullong_t
    419 aio_str2byteoff(struct aio_str *aiop)
    420 {
    421 	return (diskomizer_off2byteoff(aiop->off));
    422 }
    423 
    424 static int64_t
    425 aio_str2lba(struct aio_str *aiop)
    426 {
    427 	long long byteoff;
    428 	int64_t lba;
    429 
    430 	if (aiop->dev->v_part == NULL || aiop->dev->device_block_size == 0) {
    431 		return (-1);
    432 	}
    433 	byteoff = aio_str2byteoff(aiop);
    434 
    435 	lba = byteoff / (int64_t)aiop->dev->device_block_size;
    436 
    437 	return (aiop->dev->v_part->p_start + lba);
    438 }
    439 
    440 static void
    441 plog_dd(int pri, struct aio_str *aiop)
    442 {
    443 	if ((INDEX_TO_DIOLEN(max_disk_io_len) %
    444 	    INDEX_TO_DIOLEN(aiop->iolen)) == 0) {
    445 		daio->plog_dd(pri, aiop->fd->fd, INDEX_TO_DIOLEN(aiop->iolen),
    446 		    INDEX_TO_DIOLEN(max_disk_io_len),
    447 		    aio_str2byteoff(aiop));
    448 
    449 	}
    450 }
    451 
    452 static void
    453 report_device(int pri, struct aio_str *aiop)
    454 {
    455 	plog(pri, "Requested File %s (%s)\n",
    456 	    aiop->fd->name, aiop->dev->logicalname);
    457 }
    458 /*
    459  * report block.
    460  *
    461  * 	Report all the information about the block that was requested to be read
    462  */
    463 static void
    464 report_offset(int pri, struct aio_str *aiop)
    465 {
    466 	long long byteoff;
    467 	long long lba;
    468 
    469 	byteoff = aio_str2byteoff(aiop);
    470 
    471 	lba = aio_str2lba(aiop);
    472 
    473 	if (lba >= 0) {
    474 		plog(pri,
    475 		    "Requested File offset 0t%lld (0x%llx), block size "
    476 		    "0t%d (0x%x), LBA 0t%lld (0x%llx)\n",
    477 		    byteoff, byteoff, INDEX_TO_DIOLEN(aiop->iolen),
    478 		    INDEX_TO_DIOLEN(aiop->iolen), lba, lba);
    479 	} else {
    480 		plog(pri, "Requested File offset 0t%lld (0x%llx), block size "
    481 		    "0t%d (0x%x)\n", byteoff, byteoff,
    482 		    INDEX_TO_DIOLEN(aiop->iolen),
    483 		    INDEX_TO_DIOLEN(aiop->iolen));
    484 	}
    485 }
    486 
    487 static void
    488 report_device_and_offset(int pri, struct aio_str *aiop)
    489 {
    490 	report_device(pri, aiop);
    491 	report_offset(pri, aiop);
    492 }
    493 
    494 
    495 /*
    496  * report_error.  This is the generic error reporting routine.
    497  * It reports all errors to stderr, giving similar information
    498  * and advise as to other commands that can be tried.
    499  */
    500 void
    501 report_error(struct aio_str *aiop, const union err_info u, err_type error)
    502 {
    503 	struct timeval now_tv;
    504 	long long disk_block;
    505 	int pri;
    506 
    507 	while (my_gettimeofday(&now_tv, NULL) == -1)
    508 		pperror("gettimeofday");
    509 
    510 	disk_block = aio_str2byteoff(aiop);
    511 
    512 	mutex->stderr_enter();
    513 
    514 
    515 	if (error == ERR_HUNG) {
    516 		pri = LOG_WARNING;
    517 		time_log(pri, now_tv.tv_sec, "Time now");
    518 		time_log(pri, aiop->tv.tv_sec, "Requested io requested at");
    519 
    520 		report_device(pri, aiop);
    521 
    522 		if (is_readonly()) {
    523 			plog(pri, "%s has %ld out of %ld read%s\n",
    524 			    aiop->dev->logicalname,
    525 			    aiop->fd->number_of_hung_read,
    526 			    aiop->fd->total_read,
    527 			    aiop->fd->number_of_hung_read == 1 ? "" : "s");
    528 		} else {
    529 			plog(pri, "%s has %ld out of %ld read%s and %ld "
    530 			    "out of %ld write%s\n",
    531 			    aiop->dev->logicalname,
    532 			    aiop->fd->number_of_hung_read,
    533 			    aiop->fd->total_read,
    534 			    aiop->fd->number_of_hung_read == 1 ? "" : "s",
    535 			    aiop->fd->number_of_hung_write,
    536 			    aiop->fd->total_write,
    537 			    aiop->fd->number_of_hung_write == 1 ? "" : "s");
    538 		}
    539 		plog(pri, "waiting for more than %ld second%s\n",
    540 		    u.time, PLURAL(u.time));
    541 		if (is_readonly()) {
    542 			plog(pri, "Last read took %lld\n",
    543 			    aiop->fd->last_read_time/BILLION);
    544 		} else {
    545 			plog(pri, "Last read took %lld, last write took "
    546 			    "%lld seconds\n",
    547 			    aiop->fd->last_read_time/BILLION,
    548 			    aiop->fd->last_write_time/BILLION);
    549 		}
    550 		plog(pri,
    551 		    "oldest i/o is a %s waiting for %ld second%s\n",
    552 		    is_read_io(aiop) ? "read" : "write",
    553 		    now_tv.tv_sec - aiop->tv.tv_sec,
    554 		    ((now_tv.tv_sec - aiop->tv.tv_sec) > 1) ? "s" : "");
    555 	} else if (error == ERR_CORRUPT) {
    556 		time_t request_time;
    557 		time_t return_time;
    558 		int read_count;
    559 		struct blks *blocks;
    560 		struct fds *fd;
    561 
    562 		pri = LOG_ERR;
    563 
    564 		time_log(pri, now_tv.tv_sec, "Time now");
    565 		time_log(pri, aiop->tv.tv_sec, "Requested io requested at");
    566 
    567 		report_device_and_offset(pri, aiop);
    568 
    569 		report_error_desc(pri, aiop, u.str);
    570 
    571 		blocks = aio_attach(aiop);
    572 
    573 		fd = find_path(aiop->dev->fdhead,
    574 		    blocks[AIO_BLOCK_INDEX(aiop)].path_id);
    575 		assert(fd != NULL);
    576 
    577 		read_count = blocks[AIO_BLOCK_INDEX(aiop)].read_count;
    578 		request_time = blocks[AIO_BLOCK_INDEX(aiop)].last_requested;
    579 		return_time = request_time + GET_LAST_RETURN(
    580 		    blocks[AIO_BLOCK_INDEX(aiop)].last_returned_delta);
    581 
    582 		shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
    583 
    584 		time_log(pri, blocks[AIO_BLOCK_INDEX(aiop)].last_requested,
    585 		    "Last %s to the requested block submitted", rw_string());
    586 		time_log(pri, return_time,
    587 		    "Last %s to the requested block  returned", rw_string());
    588 
    589 		plog(pri, "Last %s to the requested block used path: %s\n",
    590 		    rw_string(), fd->name);
    591 
    592 		if (!is_readonly()) {
    593 			plog(pri, "Requested block has been read %d times "
    594 			    "since last written\n", read_count);
    595 		}
    596 
    597 		decode_errors(pri, aiop, read_count);
    598 	} else if (error == ERR_DEFERRED) {
    599 		pri = LOG_WARNING;
    600 
    601 		time_log(pri, now_tv.tv_sec, "Time now");
    602 		time_log(pri, aiop->tv.tv_sec, "Requested io requested at");
    603 		plog(pri, "%s to device %s deferred\n",
    604 		    u.str, aiop->dev->logicalname);
    605 	} else if (DAIO_RETURN(aiop->aio_res) < 0) {
    606 		char *datestr;
    607 		pri = LOG_ERR;
    608 
    609 		time_log(pri, now_tv.tv_sec, "Time now");
    610 		time_log(pri, aiop->tv.tv_sec, "Requested io requested at");
    611 		(void) plog(pri, "%s %s%s error, errno %d %s\n",
    612 		    aiop->fd->name, u.str,
    613 		    aiop->retrycnt > 0 ? " retry" : "",
    614 		    DAIO_ERROR(aiop->aio_res),
    615 		    strerror(DAIO_ERROR(aiop->aio_res)));
    616 		datestr = alloc_time_str_fmt(aiop->tv.tv_sec, "%b %e %H:%M");
    617 		if (datestr != NULL) {
    618 			plog(pri, "Try \"egrep '^%s.*%s' "
    619 			    "/var/adm/messages\"\n",
    620 			    datestr, hostname());
    621 			free(datestr);
    622 		}
    623 	} else {
    624 		pri = LOG_WARNING;
    625 
    626 		time_log(pri, now_tv.tv_sec, "Time now");
    627 		time_log(pri, aiop->tv.tv_sec, "Requested io requested at");
    628 		plog(pri, "%s short %s%s, Transferred %ld (%#lx)"
    629 		    " bytes, requested %d (%#x) bytes.\n",
    630 		    aiop->fd->name, u.str,
    631 		    aiop->retrycnt > 0 ? " retry" : "",
    632 		    (long)DAIO_RETURN(aiop->aio_res),
    633 		    (long)DAIO_RETURN(aiop->aio_res),
    634 		    INDEX_TO_DIOLEN(aiop->iolen),
    635 		    INDEX_TO_DIOLEN(aiop->iolen));
    636 	}
    637 	dlog(pri, "Block at byte offset 0t%lld (%#llx) block size %d (%#x)\n",
    638 	    disk_block, disk_block, INDEX_TO_DIOLEN(aiop->iolen),
    639 	    INDEX_TO_DIOLEN(aiop->iolen));
    640 	plog_dd(pri, aiop);
    641 
    642 	(void) fflush(stderr);
    643 	(void) fsync(fileno(stderr));
    644 	mutex->stderr_exit();
    645 }
    646 ulong_t
    647 my_lrand(void)
    648 {
    649 	union {
    650 		ulong_t l;
    651 		uint32_t i[sizeof (ulong_t)/sizeof (uint32_t)];
    652 	} u;
    653 	int i;
    654 #ifdef __lint
    655 	ZERO_OBJ(u);
    656 #endif
    657 
    658 	for (i = 0; i < (sizeof (ulong_t)/sizeof (uint32_t)); i++)
    659 		u.i[i] = (uint32_t)lrand48();
    660 
    661 	return (u.l);
    662 }
    663 
    664 static void
    665 remove_from_all_aios(struct aio_str *aiop)
    666 {
    667 	remove_from_aio_list(&aiop->fd->all_aios, aiop);
    668 }
    669 
    670 static void
    671 infantacide(void)
    672 {
    673 	(void) killpg(pgrp, SIGTERM);
    674 }
    675 /*ARGSUSED*/
    676 static loop_type
    677 on_error_exit(ullong_t start, struct aio_str *aiop)
    678 {
    679 	union err_info err_info;
    680 
    681 	err_info.str = "On error exit";
    682 	DAIO_SET_RETURN(aiop->aio_res, 0);
    683 	report_error(aiop, err_info, ERR_SYS);
    684 	remove_from_all_aios(aiop);
    685 	(void) sighold(SIGTERM);
    686 	if (incr_shared_device_error(aiop->dev->shared_data_handle,
    687 	    aiop->dev->errors) != -1) {
    688 		aiop->dev->errors = 0;
    689 	}
    690 	exit_status = EXIT_FAILURE;
    691 	exit(exit_status);
    692 	/*NOTREACHED*/
    693 	return (BREAK);
    694 }
    695 /*ARGSUSED*/
    696 static loop_type
    697 on_error_stop(ullong_t start, struct aio_str *aiop)
    698 {
    699 	pfprintf(stderr, "%s Set On error stop\n", aiop->fd->name);
    700 	if (set_shared_stop_flag(aiop->dev->shared_data_handle) == -1)
    701 		aiop->dev->need_to_stop = 1;
    702 	aiop->dev->stop_flag = 1;
    703 	return (BREAK);
    704 }
    705 
    706 /*ARGSUSED*/
    707 static loop_type
    708 on_error_nop(ullong_t start, struct aio_str *aiop)
    709 {
    710 	if (is_write_io(aiop)) {
    711 		struct blks *blocks;
    712 
    713 		/*
    714 		 * Clear the last io as the retries never suceeded
    715 		 * so we don't read this block which is now in an
    716 		 * undefined state.
    717 		 */
    718 		blocks = aio_attach(aiop);
    719 		blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io = NULL;
    720 		shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
    721 	}
    722 	pfprintf(stderr, "%s On error continue\n", aiop->fd->name);
    723 	return (BREAK);
    724 }
    725 
    726 /*ARGSUSED*/
    727 static loop_type
    728 on_error_abort(ullong_t start, struct aio_str *aiop)
    729 {
    730 	union err_info err_info;
    731 
    732 	err_info.str = "On error abort";
    733 	report_error(aiop, err_info, ERR_SYS);
    734 	/* pfprintf(stderr, "On error abort\n"); */
    735 	(void) sighold(SIGTERM);
    736 	if (incr_shared_device_error(aiop->dev->shared_data_handle,
    737 	    aiop->dev->errors) != -1) {
    738 		aiop->dev->errors = 0;
    739 	}
    740 	abort(); /* On error abort. This one is o.k. */
    741 	return (BREAK);
    742 }
    743 /*
    744  *	report_hangers_fd.  counts the number of I/O requests that
    745  * 	have been waiting for more than hanger_time seconds and then
    746  *	calls report_error() with the i/o that has been waiting the
    747  *	longest and a count of the number of i/o requests that are
    748  * 	over time.  It only calls report_error() when the number of i/o
    749  * 	requests or the oldest outstanding i/o change or if the last
    750  *	report was more than hanger_time seconds ago and there are some
    751  *	i/o hung.
    752  */
    753 static int
    754 report_hangers_fd(struct fds *fd, time_t tyme, time_t hanger_time)
    755 {
    756 	int total_hung_read = 0;
    757 	int total_hung_write = 0;
    758 #ifdef IO_COUNT_DEBUG
    759 	int total_read = 0;
    760 	int total_write = 0;
    761 #endif
    762 	struct aio_str *aiop;
    763 	union err_info err_info;
    764 
    765 	if (fd->error_path != 0 || fd->stop_flag != 0) {
    766 		return (0);
    767 	}
    768 	/* fd->total_read = fd->total_write = 0; */
    769 
    770 	err_info.time = hanger_time;
    771 
    772 	for (aiop = fd->all_aios.head; aiop != NULL; aiop = aiop->next) {
    773 		if (tyme - aiop->tv.tv_sec > hanger_time) {
    774 			if (is_read_io(aiop)) {
    775 				total_hung_read++;
    776 			} else {
    777 				total_hung_write++;
    778 			}
    779 		} else {
    780 			break;
    781 		}
    782 	}
    783 
    784 #ifdef IO_COUNT_DEBUG
    785 	assert(total_read == fd->total_read);
    786 	assert(total_write == fd->total_write);
    787 #endif
    788 
    789 	if (fd->oldest_io == NULL) {
    790 		fd->oldest_io = fd->all_aios.head;
    791 	}
    792 
    793 	if (total_hung_read != fd->number_of_hung_read ||
    794 	    total_hung_write != fd->number_of_hung_write ||
    795 	    (fd->all_aios.head != fd->oldest_io &&
    796 	    (total_hung_read || total_hung_write))) {
    797 
    798 		fd->number_of_hung_read = total_hung_read;
    799 		fd->number_of_hung_write = total_hung_write;
    800 		report_error(fd->all_aios.head, err_info, ERR_HUNG);
    801 		fd->last_report = tyme;
    802 		fd->oldest_io = fd->all_aios.head;
    803 
    804 	} else if (total_hung_read + total_hung_write > 0 &&
    805 	    fd->all_aios.head != NULL && fd->last_report + hanger_time < tyme) {
    806 
    807 		report_error(fd->all_aios.head, err_info, ERR_HUNG);
    808 		fd->last_report = tyme;
    809 	}
    810 	return (total_hung_read + total_hung_write);
    811 }
    812 /*
    813  * Search the list of i/o that are currently outstanding and report
    814  * on any that have been outstanding for more than hanger_time.
    815  * Also display howlong the oldest i/o has been Waiting for and when
    816  * it was submitted.
    817  */
    818 static void
    819 report_hangers(struct device *dev, time_t tyme, time_t hanger_time)
    820 {
    821 	int total = 0;
    822 	struct fds *fd;
    823 
    824 	for (fd = dev->fdhead; ; fd = fd->next) {
    825 		total += report_hangers_fd(fd, tyme, hanger_time);
    826 		if (dev->fdhead == fd->next)
    827 			break;
    828 	}
    829 }
    830 static void
    831 report_all_hangers(struct device *dev, time_t hanger_time)
    832 {
    833 	struct timeval tv;
    834 
    835 	while (my_gettimeofday(&tv, NULL) == -1)
    836 		pperror("gettimeofday");
    837 
    838 	for (; dev != NULL; dev = dev->next) {
    839 		report_hangers(dev, tv.tv_sec, hanger_time);
    840 	}
    841 }
    842 char *
    843 my_strdup(const char *s)
    844 {
    845 	char *x = strdup(s);
    846 	if (x == NULL) {
    847 		STRDUP_ERROR(s);
    848 	}
    849 	return (x);
    850 }
    851 void *
    852 my_calloc(long a, long b)
    853 {
    854 	void *x;
    855 
    856 	x = calloc(a, b);
    857 
    858 	if (x == NULL) {
    859 		CALLOC_ERROR(a, b);
    860 		return (NULL);
    861 	}
    862 	return (x);
    863 }
    864 
    865 static void
    866 add_to_all_aios(struct aio_str *aiop)
    867 {
    868 	add_to_aio_list(&aiop->fd->all_aios, aiop);
    869 }
    870 
    871 static int
    872 init_read(struct aio_str aio[], ullong_t start)
    873 {
    874 	struct aio_str *aiop;
    875 
    876 	aiop = my_calloc(1, sizeof (struct aio_str));
    877 	if (aiop == NULL) {
    878 		pfprintf(stderr, "init_read, can't allocate memory\n");
    879 		return (0);
    880 	}
    881 
    882 	aiop->buf = NULL;
    883 	aiop->off = 0;
    884 	aiop->handler = is_readonly() ? handle_readonly_rand : handle_read;
    885 	aiop->dev = aio->dev;
    886 	aiop->fd =  aio->dev->fdhead;
    887 	aio->dev->fdhead = aio->dev->fdhead->next;
    888 	add_to_all_aios(aiop);
    889 	(void) do_new_read(aiop, start,
    890 	    is_readonly() ? READ_ONLY_RAND : NORMAL_READ);
    891 	return (1);
    892 }
    893 static int
    894 has_no_unwritten(struct aio_str *aiop)
    895 {
    896 	return (aiop->dev->unwritten == NULL);
    897 }
    898 static void
    899 push_unwritten(struct aio_str *aiop)
    900 {
    901 	struct offset_list *new;
    902 
    903 	if ((new = calloc(1, sizeof (struct offset_list))) == NULL) {
    904 		CALLOC_ERROR(1, sizeof (struct offset_list));
    905 		aiop->dev->failed_to_push_unwritten = 1;
    906 	} else {
    907 		plog(LOG_DEBUG,
    908 		    "Block %#llx (0t%lld) %s pushed onto unwritten queue\n",
    909 		    aiop->off, aiop->off, aiop->dev->logicalname);
    910 		new->offset = aiop->off;
    911 		new->next = aiop->dev->unwritten;
    912 		aiop->dev->unwritten = new;
    913 		aiop->dev->choose_block = unwritten_block_seq;
    914 	}
    915 }
    916 static int
    917 find_unwritten(bitmap_t *map, struct aio_str *aiop, int maplen)
    918 {
    919 	struct offset_list *u, *p;
    920 	int status = 0;
    921 	p = NULL;
    922 
    923 	for (u = aiop->dev->unwritten; u != NULL; u = u->next) {
    924 		if (pend_write_with_lock(map, u->offset, maplen) == 0) {
    925 			if (p == NULL)
    926 				aiop->dev->unwritten = u->next;
    927 			else
    928 				p->next = u->next;
    929 			aiop->off = u->offset;
    930 			plog(LOG_DEBUG,
    931 			    "Block %#llx (0t%lld) %s locked and removed "
    932 			    "from unwritten queue\n",
    933 			    aiop->off, aiop->off, aiop->dev->logicalname);
    934 			free(u);
    935 			status = 1;
    936 			break;
    937 		}
    938 		p = u;
    939 	}
    940 	return (status);
    941 }
    942 static int
    943 is_unwritten(struct aio_str *aiop)
    944 {
    945 	struct offset_list *u;
    946 	for (u = aiop->dev->unwritten; u != NULL; u = u->next) {
    947 		if (u->offset == aiop->off)
    948 			return (1);
    949 	}
    950 	return (0);
    951 }
    952 /*
    953  * randomish_block
    954  *	return a random block to try to do io too or from. If we
    955  *	are short of memory the block is less random to try and
    956  *	decrease the number of attach/detach pairs that actually
    957  *	result in system calls, and therefore reduce the number of
    958  *	faults.
    959  *
    960  *	In particular when short of memory the next io will tend to
    961  *	be in the same block ob blks structures or the next block
    962  *	for odd numbered processes and the previous block for even
    963  *	numbered processes.  The overall effect is still close to
    964  *	random at the device, but individual processes thrash less.
    965  */
    966 ulong_t
    967 randomish_block(struct aio_str *aiop)
    968 {
    969 	ulong_t t;
    970 	if (shm_ops->is_short_of_mem()) {
    971 		t = aiop->off + ((my_lrand() % (shm_ops->max_size() /
    972 		    sizeof (struct blks))) * this_proc() % 2 ? -1 : 1);
    973 	} else {
    974 		t = my_lrand();
    975 	}
    976 	return (t);
    977 }
    978 /*ARGSUSED2*/
    979 void
    980 rand_block(bitmap_t *map, struct aio_str *aiop,
    981 	ullong_t start, ullong_t len, int maplen)
    982 {
    983 	ulong_t t;
    984 	ullong_t *next_io_blk_ptr;
    985 
    986 	if (is_write_io(aiop)) {
    987 		next_io_blk_ptr = &aiop->dev->next_write_blk;
    988 		if (!(aiop->dev->next_write_blk %
    989 		    opts.expert_write_cluster_length)) {
    990 			t = randomish_block(aiop);
    991 			t = t - (t % opts.expert_write_cluster_length);
    992 		} else {
    993 			TNF_PROBE_1(cluster_write, "rand_block",
    994 			    "sunw%cte%diskomizer%blocks write cluster",
    995 			    tnf_ulonglong, next_read_blk,
    996 			    aiop->dev->next_read_blk);
    997 			t = *next_io_blk_ptr;
    998 		}
    999 	} else {
   1000 		next_io_blk_ptr = &aiop->dev->next_read_blk;
   1001 		if (!(aiop->dev->next_read_blk %
   1002 		    opts.expert_read_cluster_length)) {
   1003 			t = randomish_block(aiop);
   1004 			t = t - (t % opts.expert_read_cluster_length);
   1005 		} else {
   1006 			TNF_PROBE_1(cluster_read, "rand_block",
   1007 			    "sunw%cte%diskomizer%blocks read cluster",
   1008 			    tnf_ulonglong, next_read_blk,
   1009 			    aiop->dev->next_read_blk);
   1010 			t = *next_io_blk_ptr;
   1011 		}
   1012 	}
   1013 	aiop->off = (t)%(len);
   1014 	aiop->off = find_next_free(map, aiop->off, len, maplen);
   1015 	if (next_io_blk_ptr != NULL)
   1016 		*next_io_blk_ptr = aiop->off + 1;
   1017 	assert(aiop->off < len);
   1018 }
   1019 
   1020 static void
   1021 unwritten_block(bitmap_t *map, struct aio_str *aiop,
   1022 	ullong_t start, ullong_t len, int maplen, choose_block_t chooser)
   1023 {
   1024 	plog(LOG_DEBUG, "in unwritten_block %llx\n",
   1025 	    aiop->dev->unwritten == NULL ?
   1026 	    0LL : aiop->dev->unwritten->offset);
   1027 	if (find_unwritten(map, aiop, maplen)) {
   1028 		struct blks *blocks;
   1029 
   1030 		if (has_no_unwritten(aiop))
   1031 			aiop->dev->choose_block = chooser;
   1032 		blocks = aio_attach(aiop);
   1033 		blocks[AIO_BLOCK_INDEX(aiop)].u.was_unwritten = 1;
   1034 		shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
   1035 	} else {
   1036 		chooser(map, aiop, start, len, maplen);
   1037 	}
   1038 }
/*
 * Find the "next" block to read or write for this aio_str.  The device
 * cursor (dev->block) is incremented by the number of processes, so that
 * when initializing the disk each block only gets written once.
 *
 * On return aiop->off holds a block that has been locked.  When the cursor
 * has run off the end of the device, or the scan reaches the end without
 * obtaining a lock, the pass is over: either the device's chooser is
 * switched to one of the random choosers, or the cursor is reset to
 * this_proc() for another sequential pass.  In both cases the (possibly
 * new) dev->choose_block is then called to actually pick the block.
 */
static void
seq_block(bitmap_t *map, struct aio_str *aiop,
	ullong_t start, ullong_t len, int maplen)
{
	int does_not_have_lock;

	/* The cursor is either on one of this process's blocks or parked. */
	assert((aiop->dev->block % opts.nprocs) ==  this_proc() ||
	    aiop->dev->block == len);

	aiop->off = aiop->dev->block;
	if (aiop->dev->block >= len) {
		/* Cursor already parked at the end: nothing left this pass. */
		does_not_have_lock = 1;
	} else while ((does_not_have_lock =
	    pend_write_with_lock(map, aiop->off, maplen)) != 0) {
		/*
		 * Only push blocks that this process would have to write
		 * onto the unwritten queue. This only affects the last
		 * block on the device.
		 */
		if ((aiop->off % opts.nprocs) ==  this_proc()) {
			push_unwritten(aiop);
		}
		/* Block was busy: step to this process's next block. */
		aiop->dev->block += opts.nprocs;
		aiop->off = aiop->dev->block % len;
		if (aiop->dev->block >= len) {
			/* Park the cursor; does_not_have_lock stays set. */
			aiop->dev->block = len;
			break;
		}
	}

	if (does_not_have_lock != 0) {
		/*
		 * End of a sequential pass.  seq_passes is only decremented
		 * when it is non-zero (short-circuit ||).
		 */
		if (aiop->dev->seq_passes == 0 ||
		    --aiop->dev->seq_passes == 0) {
			time_now_log(LOG_NOTICE,
			    "Finished sequential %ss on %s count %d",
			    is_readonly() ? read_str : write_str,
			    aiop->dev->logicalname,
			    aiop->count);

			if (has_no_unwritten(aiop)) {
				aiop->dev->choose_block = rand_block;
			} else {
				aiop->dev->choose_block = unwritten_block_rand;
			}
		} else {
			/* More passes to do: restart at our first block. */
			aiop->off = aiop->dev->block = this_proc();
			time_now_log(LOG_NOTICE,
			    "Starting sequential series again on %s counts %d",
			    aiop->dev->logicalname, aiop->count);
		}
		/* Let whichever chooser is now current pick the block. */
		aiop->dev->choose_block(map, aiop, start, len, maplen);
	} else {
		assert(!does_not_have_lock);

		ASSERT_OFFSET(aiop->dev->block);

		/* Lock obtained: advance the cursor for the next call. */
		aiop->dev->block += opts.nprocs;
		if (aiop->dev->block >= len) {
			aiop->dev->block = len;
		}
	}
}
   1106 static void
   1107 unwritten_block_seq(bitmap_t *map, struct aio_str *aiop,
   1108 	ullong_t start, ullong_t len, int maplen)
   1109 {
   1110 	unwritten_block(map, aiop, start, len, maplen, seq_block);
   1111 }
   1112 static void
   1113 unwritten_block_rand(bitmap_t *map, struct aio_str *aiop,
   1114 	ullong_t start, ullong_t len, int maplen)
   1115 {
   1116 	unwritten_block(map, aiop, start, len, maplen, rand_block);
   1117 }
   1118 /*
   1119  * I leave the source as this _may_ be useful in the future.
   1120  */
   1121 #ifdef NOT_USED_CODE
   1122 static char
   1123 set_write(bitmap_t map[], ullong_t off, int maplen)
   1124 {
   1125 	ulong_t tmp = GET_OFF(off) % maplen;
   1126 	char status;
   1127 
   1128 	mutex->enter(tmp);
   1129 	if (map[tmp] & GET_BIT(off)) {
   1130 		/* we are already locked */
   1131 		status = 0;
   1132 	} else {
   1133 		map[tmp] |= GET_BIT(off);
   1134 		status = 1;
   1135 	}
   1136 	mutex->exit(tmp);
   1137 	TNF_PROBE_3(set_write, "set_write", "sunw%cte%diskomizer",
   1138 	    tnf_opaque, off, off,
   1139 	    tnf_opaque, map, map,
   1140 	    tnf_char, status, status);
   1141 	return (status);
   1142 }
   1143 #endif
   1144 
   1145 void
   1146 clear_write(bitmap_t map[], ullong_t off, ulong_t maplen)
   1147 {
   1148 	ulong_t tmp = (GET_OFF(off) % maplen);
   1149 	ulong_t x;
   1150 	bitmap_t bit = ~(GET_BIT(off));
   1151 
   1152 	mutex->enter(tmp);
   1153 	x = map[tmp];
   1154 	map[tmp] &= bit;
   1155 	assert(~bit != (ulong_t)0);
   1156 	if (x == map[tmp]) {
   1157 		plog(LOG_ALERT, "Ooops block %#llx (0t%lld) was not locked\n",
   1158 		    diskomizer_off2byteoff(off), diskomizer_off2byteoff(off));
   1159 		TNF_PROBE_2(clear_write, "clear_write failed",
   1160 		    "sunw%cte%diskomizer",
   1161 		    tnf_opaque, off, off, tnf_opaque, map, map);
   1162 	} else {
   1163 		TNF_PROBE_2(clear_write, "clear_write ok",
   1164 		    "sunw%cte%diskomizer",
   1165 		    tnf_opaque, off, off,
   1166 		    tnf_opaque, map, map);
   1167 	}
   1168 	mutex->exit(tmp);
   1169 }
   1170 
   1171 #ifdef NOT_USED_CODE
   1172 static void
   1173 print_bitmap(bitmap_t map[], int maplen)
   1174 {
   1175 	int i;
   1176 
   1177 	for (i = 0; i < maplen; i++)
   1178 		pprintf("%#8.8X %#8.8X\n", i, map[i]);
   1179 	(void) fflush(stdout);
   1180 }
   1181 #endif
   1182 
   1183 /*
   1184  * find_next_free finds the "next" block that is not locked starting from
   1185  * offset.
   1186  */
   1187 ullong_t
   1188 find_next_free(bitmap_t map[], ullong_t off, int len, int maplen)
   1189 {
   1190 	ulong_t tmp = (GET_OFF(off) % maplen);
   1191 	ulong_t i = 0;
   1192 	bitmap_t bit = GET_BIT(off);
   1193 
   1194 	mutex->enter(tmp);
   1195 
   1196 	while ((map[tmp] & bit) != 0) {
   1197 		ulong_t newtmp;
   1198 
   1199 		off = off + 1;
   1200 
   1201 		off %= len;
   1202 		newtmp = (GET_OFF(off) % maplen);
   1203 		bit = GET_BIT(off);
   1204 		mutex->getnext(tmp, newtmp);
   1205 		tmp = newtmp;
   1206 		TNF_PROBE_2(find_next_free_trying, "find_next_free trying",
   1207 		    "sunw%cte%diskomizer", tnf_longlong, off, off,
   1208 		    tnf_opaque, map, map);
   1209 		if (!(i < (4 * len))) {
   1210 			(void) plog(LOG_ALERT,
   1211 			    "Unable to find free entry in map %#lx"
   1212 			    " of length %d\n",
   1213 			    (ulong_t)&map[0], maplen);
   1214 			(void) fflush(stderr);
   1215 			/* print_bitmap(map, maplen); */
   1216 			mutex->exit(tmp);
   1217 			(void) sleep(1);
   1218 			mutex->enter(tmp);
   1219 			i = 0;
   1220 		}
   1221 		i++;
   1222 	}
   1223 	map[tmp] |= GET_BIT(off);
   1224 	mutex->exit(tmp);
   1225 	TNF_PROBE_2(find_next_free_found, "find_next_free found",
   1226 	    "sunw%cte%diskomizer", tnf_longlong, off, off,
   1227 	    tnf_opaque, map, map);
   1228 	return (off);
   1229 }
   1230 
   1231 /*
   1232  * Test to see if the write bit is set for this offset.  The lock MUST
   1233  * already be held
   1234  */
   1235 static int
   1236 test_write(bitmap_t map[], ullong_t off, int maplen)
   1237 {
   1238 	ulong_t tmp = GET_OFF(off) % maplen;
   1239 
   1240 	return (map[tmp] & GET_BIT(off) ? 1 : 0);
   1241 }
   1242 
   1243 /*
   1244  * If this block is being read from or written to return true
   1245  * Otherwise return lock it and return.
   1246  */
   1247 static int
   1248 pend_write_with_lock(bitmap_t map[], ullong_t off, int maplen)
   1249 {
   1250 	ulong_t tmp = GET_OFF(off) % maplen;
   1251 	int status;
   1252 
   1253 	mutex->enter(tmp);
   1254 	plog(LOG_DEBUG, "Disk Block %lld\n", diskomizer_off2byteoff(off));
   1255 	if (map[tmp] & GET_BIT(off)) {
   1256 		status = 1;
   1257 	} else {
   1258 		map[tmp] |= GET_BIT(off);
   1259 		status = 0;
   1260 	}
   1261 	mutex->exit(tmp);
   1262 	return (status);
   1263 }
   1264 
   1265 static uchar_t
   1266 choose_iolen(struct aio_str *aiop)
   1267 {
   1268 	if (is_executable(aiop->buf)) {
   1269 		return (max_disk_io_len);
   1270 	}
   1271 	return (opts.disk_io_sizes.weightings[lrand48() %
   1272 	    opts.disk_io_sizes.wlen]);
   1273 }
   1274 
   1275 static void
   1276 init_read_buf(uchar_t *buf, ulong_t len, const uchar_t * const write_buf)
   1277 {
   1278 	void *sig = expect_signal(SIGBUS, "memset", buf, len);
   1279 
   1280 	read_buffer_initializer(buf, len, write_buf);
   1281 	cancel_expected_signal(SIGBUS, sig);
   1282 }
   1283 
   1284 static bitmap_t *
   1285 attach_dev_writemap(struct device *dev)
   1286 {
   1287 	bitmap_t *map;
   1288 	int error_count = 0;
   1289 
   1290 	while ((map = (bitmap_t *)
   1291 	    shm_ops->attach(dev->writemap_handle)) == NULL) {
   1292 		if ((error_count++ % 10000) == 0)
   1293 			ATTACH_ERROR(dev->writemap_handle);
   1294 	}
   1295 	if (error_count > 0)
   1296 		plog(LOG_WARNING, "attached o.k.\n");
   1297 	return (map);
   1298 }
   1299 static bitmap_t *
   1300 attach_aio_writemap(struct aio_str *aiop)
   1301 {
   1302 	return (attach_dev_writemap(aiop->dev));
   1303 }
   1304 static void
   1305 clear_writemap(struct aio_str *aiop)
   1306 {
   1307 	bitmap_t *map = attach_aio_writemap(aiop);
   1308 	clear_write(map, aiop->off, aiop->dev->writemap_size);
   1309 	shm_ops->detach(aiop->dev->writemap_handle);
   1310 }
   1311 static void
   1312 clear_writemap_success(struct aio_str *aiop)
   1313 {
   1314 	aiop->off = push_recent(aiop->dev->recent, aiop->off);
   1315 	if (aiop->off != -1) {
   1316 		clear_writemap(aiop);
   1317 	}
   1318 }
   1319 static struct blks *
   1320 choose_new_random_read(struct aio_str *aiop, ullong_t start, ullong_t len)
   1321 {
   1322 	struct blks *blocks;
   1323 	bitmap_t *map;
   1324 
   1325 	if ((aiop->off = pop_recent(aiop->dev->recent)) != -1) {
   1326 		return (aio_attach(aiop));
   1327 	}
   1328 
   1329 	map = attach_aio_writemap(aiop);
   1330 
   1331 	aiop->retrycnt = 0;
   1332 	for (;;) {
   1333 		rand_block(map, aiop, start, len,
   1334 		    aiop->dev->writemap_size);
   1335 		blocks = aio_attach(aiop);
   1336 
   1337 		if (is_readonly()) {
   1338 			if (0x1 & (uint_t)
   1339 			    blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io)
   1340 				break;
   1341 		} else {
   1342 			if (blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io != NULL)
   1343 				break;
   1344 		}
   1345 		shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
   1346 		blocks = NULL;
   1347 		clear_write(map, aiop->off, aiop->dev->writemap_size);
   1348 	}
   1349 	shm_ops->detach(aiop->dev->writemap_handle);
   1350 	return (blocks);
   1351 }
/*
 * do a new read.
 *
 * Prepare aiop for a read of the kind given by read_type and issue it with
 * daio->aread():
 *
 *	NORMAL_READ	block chosen by choose_new_random_read(); io length,
 *			expected buffer, checksum and header are taken from
 *			the block's r.w record and its shadow header.
 *	RETRY_READ	aiop is reused as it stands; only retrycnt is bumped.
 *	WRITE_READ	reads back aiop->off with the current aiop->iolen.
 *	READ_ONLY_RAND	block chosen by choose_new_random_read(); expected
 *			data comes from the block's r.o record.
 *	READ_ONLY_SEQ	block chosen by the device's choose_block.
 *
 * If the device is stopping, no read is issued: the block is unlocked, the
 * read buffer returned and aiop removed from its fd's all_aios list.
 *
 * Returns the tv_sec value sampled just before the read was set up.
 */
time_t
do_new_read(struct aio_str *aiop, ullong_t start, read_type_t read_type)
{
	ullong_t offset;
	int fd = aiop->fd->fd;
	ullong_t len;
	struct blks *blocks = NULL;

	/*
	 * if opts.sequential_passes is equal to seq_passes then we are on the
	 * first pass or opts.sequential_passes was zero to start with. In
	 * the second case once the disk is full aiop->dev->block will contain
	 * the address of the last block anyway.
	 */
	if (aiop->dev->seq_passes == opts.sequential_passes) {
		len = aiop->dev->block;
	} else {
		len = LEN_BYTES2BLOCKS(aiop->dev);
	}


	/* Retries do not count against the per-device loop countdown. */
	if (read_type != RETRY_READ &&
	    OPTION(nloops) != 0 && aiop->dev->countdown != 0) {
		if (--aiop->dev->countdown == 0) {
			time_now_log(LOG_INFO, "countdown on device %s is zero",
			    aiop->dev->logicalname);
		}
	}
	if (read_type == NORMAL_READ) {
		struct shadow_hdr const *shadow;

		blocks = choose_new_random_read(aiop, start, len);

		if (aiop->buf == NULL)
			aiop->buf = get_read_buf();

		aiop->iolen = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_iolen;

		aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen);
		aiop->daio_id.buf = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io;
		aiop->hdr = build_bufhr(aiop->dev, start, aiop->off);
		shadow = get_shadow_hdr(aiop->daio_id.buf);
		aiop->daio_id.chksum = shadow->chksums[aiop->iolen];
		aiop->daio_id.buf_id = get_write_buf_id(
		    blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io);
		aiop->daio_id.hdr_len = sizeof (aiop->hdr);
		aiop->daio_id.hdr = (uchar_t *)&aiop->hdr;
	} else if (read_type == RETRY_READ) {
		aiop->retrycnt++;
	} else if (read_type == WRITE_READ) {
		struct shadow_hdr const *shadow;

		blocks = aio_attach(aiop);

		aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen);
		aiop->daio_id.buf = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io;
		aiop->daio_id.buf_id = get_write_buf_id(
		    blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io);
		shadow = get_shadow_hdr(aiop->daio_id.buf);
		aiop->daio_id.chksum = shadow->chksums[aiop->iolen];
		aiop->daio_id.hdr_len = sizeof (aiop->hdr);
		aiop->daio_id.hdr = (uchar_t *)&aiop->hdr;
	} else if (read_type == READ_ONLY_RAND) {
		if (aiop->buf == NULL)
			aiop->buf = get_read_buf();

		blocks = choose_new_random_read(aiop, start, len);
		aiop->daio_id.buf =
		    BIT2CHARSTAR(blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io);
		aiop->daio_id.chksum =
		    blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_chksum;
		aiop->iolen = max_disk_io_len;
		aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen);
	} else if (read_type == READ_ONLY_SEQ) {
		bitmap_t *map;

		if (aiop->buf == NULL)
			aiop->buf = get_read_buf();

		map = attach_aio_writemap(aiop);
		/* Sequential reads always span the whole device. */
		len = LEN_BYTES2BLOCKS(aiop->dev);
		aiop->dev->choose_block(map, aiop, start, len,
		    aiop->dev->writemap_size);
		shm_ops->detach(aiop->dev->writemap_handle);
		blocks = aio_attach(aiop);

		aiop->daio_id.buf =
		    BIT2CHARSTAR(blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io);
		aiop->daio_id.chksum =
		    blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_chksum;
		aiop->iolen = max_disk_io_len;
		aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen);
	}

	offset = aio_str2byteoff(aiop);

	while (my_gettimeofday(&aiop->tv, NULL) == -1)
		pperror("gettimeofday");

	/* RETRY_READ has not attached the blks array yet. */
	if (blocks == NULL)
		blocks = aio_attach(aiop);

	/*
	 * NOTE(review): r.w.last_io is used here for every read type,
	 * including the read only ones that otherwise use r.o - confirm
	 * that is intended.
	 */
	init_read_buf(aiop->buf, INDEX_TO_DIOLEN(max_disk_io_len),
	    blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io);

	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));

	/* Issue the read, going round again only if aread gives EAGAIN. */
	for (;;) {
		if (aiop->dev->stop_flag ||
		    stop_check(aiop->dev->shared_data_handle)) {
			if (aiop->dev->stop_flag == 0) {
				plog(LOG_NOTICE, "Stopping %s\n",
				    aiop->dev->logicalname);
				aiop->dev->stop_flag = 1;
			} else if (aiop->dev->need_to_stop &&
			    set_shared_stop_flag(
			    aiop->dev->shared_data_handle) != -1) {
				aiop->dev->need_to_stop = 0;
			}
			/* Stopping: give back the lock, buffer and list slot. */
			clear_writemap(aiop);
			return_read_buf(aiop->buf);
			aiop->buf = NULL;
			remove_from_aio_list(&aiop->fd->all_aios, aiop);
			break;
		}

		ZERO_OBJ(aiop->error.desc);

		/* Move to the beginning of the all_aios list */
		remove_from_aio_list(&aiop->fd->all_aios, aiop);
		add_to_aio_list(&aiop->fd->all_aios, aiop);

		TNF_PROBE_4(aioread, "aioread",
		    "sunw%cte%diskomizer%aio read",
		    tnf_long, fd, aiop->fd->fd,
		    tnf_opaque, offset, offset,
		    tnf_opaque, aiop, aiop,
		    aio_tnf_str, *aiop, aiop);

		if (daio->aread(fd, aiop->buf,
		    INDEX_TO_DIOLEN(aiop->iolen), offset,
		    &aiop->aio_res, &aiop->daio_id) < 0) {
			if (errno == EAGAIN) {
				AIOREAD_ERROR(fd, aiop->fd->name,
				    aiop->buf,
				    INDEX_TO_DIOLEN(aiop->iolen), offset,
				    SEEK_SET, &aiop->aio_res);
				continue;
			} else {
				AIOREAD_ERROR(fd, aiop->fd->name, aiop->buf,
				    INDEX_TO_DIOLEN(aiop->iolen), offset,
				    SEEK_SET, &aiop->aio_res);
				clear_writemap(aiop);
			}
		}
		/*
		 * NOTE(review): total_read is also incremented when aread
		 * failed with an error other than EAGAIN, whereas
		 * do_new_write only counts successful submissions - confirm
		 * which is intended.
		 */
		aiop->fd->total_read++;
		break;
	}
	return (aiop->tv.tv_sec);
}
   1515 static struct shadow_hdr const *
   1516 set_io_len(struct aio_str *aiop)
   1517 {
   1518 	struct shadow_hdr const *shadow_hdr = get_shadow_hdr(aiop->buf);
   1519 	if (!shadow_hdr->type.BUF_READY) {
   1520 		struct shadow_hdr *shadow;
   1521 		int j;
   1522 		shadow = (struct shadow_hdr *)shadow_hdr;
   1523 		init_buf(aiop->buf);
   1524 		for (j = 0; j <= opts.disk_io_sizes.weightings[
   1525 		    opts.disk_io_sizes.wlen - 1]; j++) {
   1526 			shadow->chksums[j] =
   1527 			    check_bufbody(aiop->buf,
   1528 			    INDEX_TO_DIOLEN(j));
   1529 		}
   1530 		shadow->type = get_bufhdr_a(aiop->buf).type;
   1531 		shadow->type.BUF_READY = 1;
   1532 		if (opts.obscure_execute && is_executable(aiop->buf)) {
   1533 			run_func(aiop->buf,
   1534 			    opts.disk_io_sizes.vals[aiop->iolen] -
   1535 			    SIZEOF_BUFHDR);
   1536 		}
   1537 		aiop->iolen = choose_iolen(aiop);
   1538 	} else if (shadow_hdr->type.BUF_READ_ONLY) {
   1539 		aiop->iolen = max_disk_io_len;
   1540 	} else {
   1541 		assert(*aiop->buf == 0xAA || *aiop->buf == 0x55);
   1542 		aiop->iolen = choose_iolen(aiop);
   1543 	}
   1544 	return (shadow_hdr);
   1545 }
   1546 static int
   1547 is_sequential(struct aio_str *aiop)
   1548 {
   1549 	return (aiop->dev->choose_block == seq_block ||
   1550 	    aiop->dev->choose_block == unwritten_block_seq);
   1551 }
   1552 
   1553 static struct aio_str *
   1554 get_deferred_io(struct device *dev)
   1555 {
   1556 	struct device *devp;
   1557 	struct aio_str *aiop;
   1558 
   1559 	for (devp = dev; devp != NULL; devp = devp->next) {
   1560 		if ((aiop = pop_from_aio_list(&devp->deferred_ios)) != NULL) {
   1561 			return (aiop);
   1562 		}
   1563 	}
   1564 	return (NULL);
   1565 }
   1566 
   1567 static void
   1568 deferred_starter(struct device *dev, ullong_t start)
   1569 {
   1570 	int all_going = 1;
   1571 	struct device *devp;
   1572 
   1573 	for (devp = dev; devp != NULL; devp = devp->next) {
   1574 		struct aio_str *aiop;
   1575 
   1576 		check_exit_flag();
   1577 
   1578 		aiop = pop_from_aio_list(&devp->deferred_ios);
   1579 		if (aiop != NULL) {
   1580 			cancelled_count--;
   1581 			aiop->handler(aiop, start);
   1582 		}
   1583 		if (is_aio_on_list(&devp->deferred_ios)) {
   1584 			all_going = 0;
   1585 		}
   1586 	}
   1587 	if (all_going == 1) {
   1588 		start_deferred = (void (*)(struct device *, ullong_t)) nop;
   1589 	}
   1590 }
   1591 
   1592 static int
   1593 number_of_writes(struct device *dev)
   1594 {
   1595 	struct fds *fd = dev->fdhead;
   1596 	int count = 0;
   1597 
   1598 	do {
   1599 		count += fd->total_write;
   1600 		fd = fd->next;
   1601 	} while (fd != dev->fdhead);
   1602 
   1603 	return (count);
   1604 }
   1605 
/* static void */
/*
 * do_new_write
 *	Issue an asynchronous write for aiop with daio->awrite().
 *
 *	When retry is zero a block is chosen with the device's choose_block
 *	and the io length with set_io_len().  If no acceptable block is
 *	found within the search limit, the aio is parked on the device's
 *	deferred_ios list and the function returns without writing.  When
 *	retry is non-zero the block already in aiop->off is rewritten; a
 *	retry value of 1 also logs and bumps the retry count.
 *
 *	An aiop arriving with fd == NULL is treated as a deferred io being
 *	restarted: it is bound to the device's fdhead first.
 *
 *	After a successful submission by a non-deferred aio, one deferred io
 *	for the same device (if any) is started by calling this function
 *	again.
 */
void
do_new_write(struct aio_str *aiop, ullong_t start, int retry)
{
	ullong_t len;
	ullong_t offset;
	long writemap_size;
	struct shadow_hdr const *shadow_hdr;
	struct blks *blocks;
	struct blks *block;
	char deferred;

	if (aiop->fd == NULL) {
		/* Restarting a deferred io: give it an fd again. */
		aiop->fd = aiop->dev->fdhead;
		add_to_aio_list(&aiop->fd->all_aios, aiop);
		deferred = 1;
	} else {
		deferred = 0;
	}

	len = LEN_BYTES2BLOCKS(aiop->dev);
	writemap_size = aiop->dev->writemap_size;

	if (!retry) {
		bitmap_t *map;
		int i;
		if (write_loops) {
			if (--aiop->dev->countdown == 0) {
				time_now_log(LOG_INFO,
				    "countdown on device %s is zero",
				    aiop->dev->logicalname);
			}
		}
		if (aiop->buf == NULL) {
			aiop->buf = get_write_buf();
		}
		shadow_hdr = set_io_len(aiop);
		map = attach_aio_writemap(aiop);

		/*
		 * Search for a block this buffer may be written to.  Each
		 * candidate returned by choose_block is already locked.
		 */
		for (i = 0; /* cstyle */; i++) {
			aiop->dev->choose_block(map, aiop, start, len,
			    writemap_size);
			blocks = aio_attach(aiop);

			block = &blocks[AIO_BLOCK_INDEX(aiop)];
			/*
			 * Accept a never-written block, or one that has been
			 * read at least read_minimum times and does not
			 * already hold this very buffer.
			 */
			if (block->r.w.last_io == NULL ||
			    (block->read_count >= OPTION(read_minimum) &&
			    block->r.w.last_io != aiop->buf)) {
					break;
			} else {
				if (block->r.w.last_io == aiop->buf) {
					uchar_t *buf;
					/*
					 * Get the new write buf first so that
					 * You definitely get a new buffer.
					 */
					if ((buf = get_write_buf()) != NULL) {
						return_write_buf(aiop->buf);
						aiop->buf = buf;
						shadow_hdr = set_io_len(aiop);
						break;
					}
				}
				/* Rejected: unlock and detach the candidate. */
				clear_write(map, aiop->off, writemap_size);
				shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
				block = blocks = NULL;
				/*
				 * Give up after enough tries (or at once if
				 * this was already a deferred io) and park
				 * the aio on the deferred list.
				 */
				if (i * OPTION(obscure_search_multiplier) >=
				    len || deferred) {
					if (!deferred) {
						union err_info err_info;
						err_info.str = "write";
						report_error(aiop, err_info,
						    ERR_DEFERRED);
					}
					remove_from_aio_list(
					    &aiop->fd->all_aios, aiop);
					aiop->fd = NULL;
					add_to_aio_list(
					    &aiop->dev->deferred_ios, aiop);
					return_write_buf(aiop->buf);
					aiop->buf = NULL;
					shm_ops->detach(
					    aiop->dev->writemap_handle);
					/*
					 * With no writes counted on this
					 * device, arm deferred_starter as the
					 * start_deferred hook.
					 */
					if (number_of_writes(aiop->dev) == 0) {
						start_deferred =
						    deferred_starter;
					}
					return;
				}
			}
		}
		aiop->retrycnt = 0;
		shm_ops->detach(aiop->dev->writemap_handle);
	} else {
		/* if we are retrying then we already have the lock. */
		ullong_t blockno = aio_str2byteoff(aiop);
		shadow_hdr = get_shadow_hdr(aiop->buf);
		if (retry == 1) {
			pfprintf(stderr,
			    "%s Block 0t%lld (%#llx) retry count %d\n",
			    aiop->fd->name, blockno, blockno,
			    ++aiop->retrycnt);
		}
		blocks = aio_attach(aiop);

		block = &blocks[AIO_BLOCK_INDEX(aiop)];
	}

	/* Byte offset of the block: blocks are max_disk_io_len apart. */
	offset = (ullong_t)start + (INDEX_TO_DIOLEN(max_disk_io_len)*aiop->off);

	assert((ullong_t)offset >= (ullong_t)start);
	assert((ullong_t)offset <= (ullong_t)(start + aiop->dev->length -
	    INDEX_TO_DIOLEN(max_disk_io_len)));

	while (my_gettimeofday(&aiop->tv, NULL) == -1)
		pperror("gettimeofday");

	if (shadow_hdr->type.BUF_READ_ONLY == 0) {
		/*
		 * Set up the buffer header and store away the path_id of the
		 * path we are using, and the header checksum.
		 */
		unprotect_buf(aiop->buf);
		toggle_bufhdr(aiop->buf);
		set_bufhdr_all(aiop->buf, shadow_hdr->chksums[aiop->iolen],
		    INDEX_TO_DIOLEN(aiop->iolen),
		    aiop->fd->devid, offset, shadow_hdr->type,
		    ++block->sequence,
		    aiop->tv.tv_sec);
		block->path_id = aiop->fd->path_id;
		block->hdrchksum =
		    set_buf_hdrchksum(aiop->buf);
		protect_buf(aiop->buf);
	} else {
		/* Read-only buffer: its header is left untouched. */
		block->hdrchksum = check_bufhdr(aiop->buf,
		    get_bufhdr_hdrchksum(aiop->buf));
	}
	if (get_bufhdr_hdrchksum(aiop->buf) != block->hdrchksum) {
		pfprintf(stderr, "writing bad checksum buf %#lx\n",
		    (ulong_t)aiop->buf);
	}
	if (opts.obscure_execute && is_executable(aiop->buf)) {
		plog(LOG_DEBUG, "Writing executable buffer\n");
	}
	/*
	 * This if is saying that this process should have initialized
	 * this block, during the sequential part of the run.  So last_io
	 * should be set. If not then something went wrong.
	 *
	 * The block could also been skipped as it was busy and put on
	 * the unwritten list, so only check if the unwritten list is
	 * empty.
	 *
	 * NOTE(review): the condition below also requires was_unwritten
	 * to be 1 and the block not to be on the unwritten list; confirm
	 * that matches the intent described above.
	 */

	if (!is_sequential(aiop) && block->r.w.last_io == NULL &&
	    (aiop->off % opts.nprocs) == this_proc() &&
	    aiop->dev->failed_to_push_unwritten == 0 &&
	    blocks[AIO_BLOCK_INDEX(aiop)].u.was_unwritten == 1 &&
	    !is_unwritten(aiop)) {
		ullong_t blockno = aio_str2byteoff(aiop);
		pfprintf(stderr, "Device %s\n", aiop->fd->name);
		pfprintf(stderr, "Device len %#llx\n", aiop->dev->length);
		pfprintf(stderr, "This proc %d nprocs %ld\n", this_proc(),
		    opts.nprocs);
		pfprintf(stderr, "Block %#llx (0t%lld) byte off %llx error\n",
		    aiop->off, aiop->off, blockno);
		pfprintf(stderr, "Last Requested %ld\n",
		    block->last_requested);
		pfprintf(stderr, "Last return delta %d\n",
		    block->last_returned_delta);
		pfprintf(stderr, "Read Count %d\n", block->read_count);
		pfprintf(stderr, "Last Io Len %d\n",
		    INDEX_TO_DIOLEN(block->r.w.last_iolen));
		/* Always fails here: last_io was just tested to be NULL. */
		assert(block->r.w.last_io != NULL);
	}

	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));

	/* Issue the write, going round again only if awrite gives EAGAIN. */
	for (;;) {
		if (aiop->dev->stop_flag ||
		    stop_check(aiop->dev->shared_data_handle)) {
			if (aiop->dev->stop_flag == 0) {
				plog(LOG_NOTICE, "Stopping %s\n",
				    aiop->dev->logicalname);
				aiop->dev->stop_flag = 1;
			} else if (aiop->dev->need_to_stop &&
			    set_shared_stop_flag(
			    aiop->dev->shared_data_handle) != -1) {
					aiop->dev->need_to_stop = 0;
			}
			/* Stopping: give back the lock, buffer and list slot. */
			clear_writemap(aiop);
			return_write_buf(aiop->buf);
			aiop->buf = NULL;
			remove_from_aio_list(&aiop->fd->all_aios, aiop);
			break;
		}

		assert(*aiop->buf == 0xAA || *aiop->buf == 0x55);

		ZERO_OBJ(aiop->error.desc);

		TNF_PROBE_4(daiowrite, "aiowrite",
		    "sunw%cte%diskomizer%aio write",
		    tnf_long, fd, aiop->fd->fd,
		    tnf_opaque, offset, offset,
		    tnf_opaque, aiop, aiop,
		    aio_tnf_str, *aiop, aiop);
		aiop->daio_id.buf = aiop->buf;
		aiop->daio_id.buf_id = get_write_buf_id(aiop->buf);
		aiop->daio_id.hdr_len = sizeof (aiop->hdr);
		aiop->daio_id.hdr = (uchar_t *)&aiop->hdr;
		/* Keep a private copy of the header that is being written. */
		(void) memcpy(&aiop->hdr, aiop->buf, sizeof (aiop->hdr));

		aiop->daio_id.footer_len = 0;

		/* Move to the beginning of the all_aios list */
		remove_from_aio_list(&aiop->fd->all_aios, aiop);
		add_to_aio_list(&aiop->fd->all_aios, aiop);

		if (daio->awrite(aiop->fd->fd, aiop->buf,
		    INDEX_TO_DIOLEN(aiop->iolen),
		    offset, &aiop->aio_res, &aiop->daio_id) == -1) {
			/* Save errno before the error report can change it. */
			int serrno = errno;
			AIOWRITE_ERROR(aiop->fd->fd, aiop->fd->name,
			    (ulong_t)aiop->buf,
			    INDEX_TO_DIOLEN(aiop->iolen),
			    offset,
			    SEEK_SET,
			    (ulong_t)&aiop->aio_res);
			if (serrno == EAGAIN) {
				continue;
			} else {
				clear_writemap(aiop);
			}
		} else if (!deferred) {
			aiop->fd->total_write++;
			/*
			 * A write is under way, so start one deferred io for
			 * this device if any are waiting.  Note that aiop is
			 * reassigned to the deferred io here.
			 */
			if (is_aio_on_list(&aiop->dev->deferred_ios)) {
				aiop = pop_from_aio_list(
				    &aiop->dev->deferred_ios);
				do_new_write(aiop, start, 0);
			}
		} else {
			aiop->fd->total_write++;
			plog(LOG_NOTICE, "Started deferred io to %s\n",
			    aiop->dev->logicalname);
		}
		break;
	}
}
   1855 
   1856 /*ARGSUSED1*/
   1857 void
   1858 run_func(uchar_t *buf, size_t size)
   1859 {
   1860 	uchar_t *cptr;
   1861 #ifdef SPARC
   1862 	uint32_t *last, *ptr;
   1863 #else
   1864 	uint32_t *ptr;
   1865 #endif
   1866 	void (*func)(void);
   1867 
   1868 	cptr = get_buf_data(buf);
   1869 #ifdef SPARC
   1870 	/* check alignment for SPARC */
   1871 	if ((ulong_t)cptr % 4) {
   1872 		return;
   1873 	}
   1874 #endif
   1875 	/*LINTED*/
   1876 	ptr = (uint32_t *)cptr;
   1877 	func = (void (*)(void))(ptr);
   1878 #ifdef SPARC
   1879 	last = ptr + size / sizeof (uint32_t);
   1880 
   1881 	for (; ptr < last; ptr++)
   1882 		flush((int32_t *)ptr);
   1883 #endif
   1884 
   1885 	plog(LOG_DEBUG, "Running func %#lx in buf %#lx, type %llx\n",
   1886 	    func, (ulong_t)buf, get_bufhdr(buf).start);
   1887 	TNF_PROBE_1(run_func, "run_func",
   1888 	    "sunw%cte%diskomizer%aio execute run",
   1889 	    tnf_opaque, buf, buf);
   1890 	func();
   1891 }
   1892 
   1893 struct fds *
   1894 find_path(struct fds *fdhead, char path_id)
   1895 {
   1896 	struct fds *fd;
   1897 
   1898 	for (fd = fdhead->next; ; fd = fd->next) {
   1899 		if (fd->path_id == path_id)
   1900 			return (fd);
   1901 		if (fd == fdhead)
   1902 			return (NULL);
   1903 	}
   1904 }
   1905 
   1906 static struct bufhdr
   1907 build_bufhr(struct device *dev, ullong_t start, ullong_t off)
   1908 {
   1909 	struct bufhdr hdr;
   1910 	struct shadow_hdr const *shadow_hdr;
   1911 	struct blks *block;
   1912 	struct blks *blocks;
   1913 	struct fds *fd;
   1914 	ushort16_t hdrchksum;
   1915 	int error_count = 0;
   1916 	ullong_t offset = (ullong_t)start +
   1917 	    (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*off);
   1918 	while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) {
   1919 		if (error_count++ % 10000 == 0)
   1920 			ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off));
   1921 	}
   1922 	if (error_count > 0)
   1923 		plog(LOG_WARNING, "attached o.k.\n");
   1924 	block = &blocks[DEV_BLOCK_INDEX(dev, off)];
   1925 	ZERO_OBJ(hdr);
   1926 	fd = find_path(dev->fdhead, block->path_id);
   1927 	assert(fd != NULL);
   1928 
   1929 	if (block->bad_hdr) {
   1930 		(void) memcpy(&hdr, block->r.w.last_io, SIZEOF_BUFHDR);
   1931 		shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   1932 		return (hdr);
   1933 	}
   1934 	shadow_hdr = get_shadow_hdr(block->r.w.last_io);
   1935 
   1936 	if (block->ab == 1) {
   1937 		hdr.start = hdr.end = BUF_TYPE_A;
   1938 		hdr.ab.a.chksum = shadow_hdr->chksums[block->r.w.last_iolen];
   1939 		hdr.ab.a.type = shadow_hdr->type;
   1940 		hdr.ab.a.type.sequence = block->sequence;
   1941 		hdr.ab.a.devid = fd->devid;
   1942 		hdr.ab.a.off = offset;
   1943 		hdr.ab.a.time = block->last_requested;
   1944 		hdr.ab.a.did = master_pid();
   1945 		hdr.ab.a.len = INDEX_TO_DIOLEN(block->r.w.last_iolen);
   1946 		get_serial_and_provider(hdr.ab.a.serial_and_provider,
   1947 		    SIZEOF_SERIAL_AND_PROVIDER);
   1948 	} else {
   1949 		hdr.start = hdr.end = BUF_TYPE_B;
   1950 		hdr.ab.b.time = block->last_requested;
   1951 		hdr.ab.b.chksum =  shadow_hdr->chksums[block->r.w.last_iolen];
   1952 		hdr.ab.b.type = shadow_hdr->type;
   1953 		hdr.ab.b.type.sequence = block->sequence;
   1954 		hdr.ab.b.devid = fd->devid;
   1955 		hdr.ab.b.off = offset;
   1956 		hdr.ab.b.did = master_pid();
   1957 		hdr.ab.b.len = INDEX_TO_DIOLEN(block->r.w.last_iolen);
   1958 		get_serial_and_provider(hdr.ab.b.serial_and_provider,
   1959 		    SIZEOF_SERIAL_AND_PROVIDER);
   1960 	}
   1961 	if ((hdrchksum = set_hdrchksum(&hdr)) != block->hdrchksum) {
   1962 		pfprintf(stderr,
   1963 		    "Bad rebuilt buf header is %#x should be %#x\n",
   1964 		    block->hdrchksum, hdrchksum);
   1965 	}
   1966 	shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   1967 	return (hdr);
   1968 }
   1969 struct bufhdr
   1970 build_prevbufhr(struct device *dev, ullong_t start, ullong_t off)
   1971 {
   1972 	struct bufhdr hdr;
   1973 	struct shadow_hdr const *shadow_hdr;
   1974 	struct blks *block;
   1975 	struct blks *blocks;
   1976 	struct fds *fd;
   1977 	ushort16_t hdrchksum;
   1978 	int error_count = 0;
   1979 	ullong_t offset = (ullong_t)start +
   1980 	    (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*off);
   1981 	while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) {
   1982 		if (error_count++ % 10000 == 0)
   1983 			ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off));
   1984 	}
   1985 	if (error_count > 0)
   1986 		plog(LOG_WARNING, "attached o.k.\n");
   1987 	block = &blocks[DEV_BLOCK_INDEX(dev, off)];
   1988 	ZERO_OBJ(hdr);
   1989 	fd = find_path(dev->fdhead, block->path_id);
   1990 	assert(fd != NULL);
   1991 
   1992 	if (block->bad_hdr) {
   1993 		(void) memcpy(&hdr, block->r.w.last_io, SIZEOF_BUFHDR);
   1994 		shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   1995 		return (hdr);
   1996 	}
   1997 	shadow_hdr = get_shadow_hdr(block->r.w.prev_io);
   1998 
   1999 	if (block->ab != 1) {
   2000 		hdr.start = hdr.end = BUF_TYPE_A;
   2001 		hdr.ab.a.time = block->u.prev_requested;
   2002 		hdr.ab.a.chksum = shadow_hdr->chksums[block->r.w.prev_iolen];
   2003 		hdr.ab.a.type = shadow_hdr->type;
   2004 		hdr.ab.a.type.sequence = block->sequence - 1;
   2005 		hdr.ab.a.devid = fd->devid;
   2006 		hdr.ab.a.off = offset;
   2007 		get_serial_and_provider(hdr.ab.a.serial_and_provider,
   2008 		    SIZEOF_SERIAL_AND_PROVIDER);
   2009 		hdr.ab.a.len = INDEX_TO_DIOLEN(block->r.w.prev_iolen);
   2010 		hdr.ab.a.did = master_pid();
   2011 	} else {
   2012 		hdr.start = hdr.end = BUF_TYPE_B;
   2013 		hdr.ab.b.chksum =  shadow_hdr->chksums[block->r.w.prev_iolen];
   2014 		hdr.ab.b.type = shadow_hdr->type;
   2015 		hdr.ab.b.type.sequence = block->sequence - 1;
   2016 		hdr.ab.b.devid = fd->devid;
   2017 		hdr.ab.b.off = offset;
   2018 		hdr.ab.b.len = INDEX_TO_DIOLEN(block->r.w.prev_iolen);
   2019 		hdr.ab.b.time = block->u.prev_requested;
   2020 		hdr.ab.b.did = master_pid();
   2021 		get_serial_and_provider(hdr.ab.b.serial_and_provider,
   2022 		    SIZEOF_SERIAL_AND_PROVIDER);
   2023 	}
   2024 	if ((hdrchksum = set_hdrchksum(&hdr)) != block->hdrchksum) {
   2025 		pfprintf(stderr,
   2026 		    "Bad rebuilt buf header is %#x should be %#x\n",
   2027 		    block->hdrchksum, hdrchksum);
   2028 	}
   2029 	shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   2030 	return (hdr);
   2031 }
   2032 static struct diff_return
   2033 memdiff_data(FILE *err, uchar_t *goodptr, uchar_t *badptr,
   2034 	int offset, int len)
   2035 {
   2036 	int i;
   2037 	struct diff_return dr;
   2038 	union {
   2039 		uchar_t c[sizeof (uint64_t) /  sizeof (uchar_t)];
   2040 		uint32_t i[sizeof (uint64_t) / sizeof (uint32_t)];
   2041 		uint64_t l;
   2042 	} good, bad, diff;
   2043 
   2044 	dr.bits = 0LL;
   2045 	dr.count = 0LL;
   2046 
   2047 	for (i = 0; i < len; i += sizeof (uint64_t)) {
   2048 		(void) memcpy(&good.c[0], goodptr, sizeof (uint64_t));
   2049 		(void) memcpy(&bad.c[0], badptr, sizeof (uint64_t));
   2050 		diff.i[0] = good.i[0] ^ bad.i[0];
   2051 		diff.i[1] = good.i[1] ^ bad.i[1];
   2052 
   2053 		if (!opts.expert_small_diffs || diff.l) {
   2054 			int bc = count_uint32_bits(diff.i[0]) +
   2055 			    count_uint32_bits(diff.i[1]);
   2056 			dr.count += bc;
   2057 			dr.bits |= diff.l;
   2058 #ifdef _BIG_ENDIAN
   2059 			(void) fprintf(err,
   2060 			    "0x%8.8x %8.8x%8.8x %8.8x%8.8x "
   2061 			    "%8.8x%8.8x %2.2d\n", i + offset,
   2062 			    good.i[0], good.i[1], bad.i[0], bad.i[1],
   2063 			    diff.i[0], diff.i[1], bc);
   2064 #elif defined(_LITTLE_ENDIAN)
   2065 			(void) fprintf(err,
   2066 			    "0x%8.8x "
   2067 			    "%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x "
   2068 			    "%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x "
   2069 			    "%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x "
   2070 			    "%2.2d\n", i + offset,
   2071 			    good.c[0], good.c[1], good.c[2], good.c[3],
   2072 			    good.c[4], good.c[5], good.c[6], good.c[7],
   2073 			    bad.c[0], bad.c[1], bad.c[2], bad.c[3],
   2074 			    bad.c[4], bad.c[5], bad.c[6], bad.c[7],
   2075 			    diff.c[0], diff.c[1], diff.c[2], diff.c[3],
   2076 			    diff.c[4], diff.c[5], diff.c[6], diff.c[7],
   2077 			    bc);
   2078 #else
   2079 #error "niether _BIG_ENDIAN or _LITTLE_ENDIAN defined"
   2080 #endif
   2081 		}
   2082 		badptr += sizeof (uint64_t);
   2083 		goodptr += sizeof (uint64_t);
   2084 	}
   2085 	return (dr);
   2086 }
   2087 
   2088 static struct diff_return
   2089 memdiff_bufhdr(FILE *err, uchar_t *buf, uchar_t *good_hdr)
   2090 {
   2091 	union {
   2092 		struct bufhdr hdr;
   2093 		uchar_t c[SIZEOF_BUFHDR];
   2094 	} bad;
   2095 
   2096 	bad.hdr = get_bufhdr(buf);
   2097 
   2098 	return (memdiff_data(err, good_hdr, &bad.c[0], 0, SIZEOF_BUFHDR));
   2099 }
   2100 
   2101 char *
   2102 diff_file(void)
   2103 {
   2104 	char *wd;
   2105 	static char *diffs_file;
   2106 
   2107 	if (NULL == diffs_file) {
   2108 		if (diffs[0] != '/' && (wd = getcwd(NULL, 128)) != NULL) {
   2109 			int x = strlen(diffs) + strlen(wd) + 2;
   2110 			if ((diffs_file = malloc(x)) != NULL) {
   2111 				snprintf(diffs_file, x, "%s/%s", wd, diffs);
   2112 			} else {
   2113 				diffs_file = diffs;
   2114 			}
   2115 			free(wd);
   2116 		} else {
   2117 			diffs_file = diffs;
   2118 		}
   2119 	}
   2120 	return (diffs_file);
   2121 }
   2122 
   2123 struct diff_return
   2124 memdiff_buf(uint64_t off, struct device *dev, uchar_t *buf, uint32_t iolen,
   2125 	struct fds *fd, const char *str, struct error *error)
   2126 {
   2127 	static const char zero2seven[] = "0 1 2 3 4 5 6 7";
   2128 	uchar_t *badptr;
   2129 	uchar_t *goodptr;
   2130 	uchar_t *prevptr; /* pointer to the previous buffer that was written */
   2131 	FILE *err;
   2132 	time_t now;
   2133 	sigset_t nset; /* new set */
   2134 	sigset_t oset; /* old set */
   2135 	int sigprocmask_status;
   2136 	int error_count = 0;
   2137 	struct diff_return dr;
   2138 	struct diff_return dr2;
   2139 	struct blks *blocks, *block;
   2140 	union {
   2141 		struct bufhdr hdr;
   2142 		uchar_t c[SIZEOF_BUFHDR];
   2143 	} good, prev;
   2144 	while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) {
   2145 		if (error_count++ % 10000 == 0)
   2146 			ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off));
   2147 	}
   2148 	if (error_count > 0)
   2149 		plog(LOG_WARNING, "attached o.k.\n");
   2150 	block = &blocks[DEV_BLOCK_INDEX(dev, off)];
   2151 	if (block->r.w.last_io == NULL) {
   2152 		shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   2153 		dr.count = -1;
   2154 		return (dr);
   2155 	}
   2156 	goodptr = get_buf_data(block->r.w.last_io);
   2157 	if (block->r.w.prev_io != NULL) {
   2158 		prevptr = get_buf_data(block->r.w.prev_io);
   2159 	} else {
   2160 		prevptr = NULL;
   2161 	}
   2162 	shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   2163 
   2164 	badptr = get_buf_data(buf);
   2165 
   2166 	if ((err = fopen(diff_file(), "a+")) == NULL) {
   2167 		err = stderr;
   2168 		FOPEN_ERROR(diff_file(), "a+");
   2169 
   2170 		(void) sigemptyset(&nset);
   2171 		(void) sigaddset(&nset, SIGINT);
   2172 		(void) sigaddset(&nset, SIGTERM);
   2173 		sigprocmask_status =
   2174 		    sigprocmask(SIG_BLOCK, &nset, &oset);
   2175 		mutex->stderr_enter();
   2176 	}
   2177 
   2178 	now = time(NULL);
   2179 	(void) fprintf(err, "diskomizer %s\n", VERSION);
   2180 	print_bufhdr_offsets(err);
   2181 
   2182 	(void) fprintf(err, "Error Instance %d\n", get_error_instance_number());
   2183 	(void) fprintf(err, "Diffs dumped %s", ctime(&now));
   2184 	(void) fprintf(err, "Diffs from %s for block 0x%llx\n",
   2185 	    str, diskomizer_off2byteoff(off));
   2186 	(void) fprintf(err,
   2187 	    "use \""
   2188 	    "dd if=%s bs=%d iseek=%lld count=1\" to read the block\n",
   2189 	    fd->longname, iolen,
   2190 	    (opts.start_offset) + off);
   2191 	good.hdr = build_bufhr(dev, start_offset(), off);
   2192 	decode_header(err, &good.c[0], buf);
   2193 	(void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s %s\n",
   2194 	    "", "Written", "Read", "Diffs", "Bit count");
   2195 	(void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s\n",
   2196 	    "Offset", zero2seven, zero2seven, zero2seven);
   2197 
   2198 	dr = memdiff_bufhdr(err, buf, &good.c[0]);
   2199 	dr2 = memdiff_data(err, goodptr, badptr, SIZEOF_BUFHDR,
   2200 	    iolen - SIZEOF_BUFHDR);
   2201 
   2202 	dr.count += dr2.count;
   2203 	dr.bits |= dr2.bits;
   2204 
   2205 	(void) fprintf(err, "End of diffs for block 0x%llx\n",
   2206 	    diskomizer_off2byteoff(off));
   2207 	if (prevptr != NULL && opts.display_prev_diffs) {
   2208 		prev.hdr = build_prevbufhr(dev, start_offset(), off);
   2209 		(void) fprintf(err, "Diffs from %s for previous io to block "
   2210 		    "0x%llx\n", str, diskomizer_off2byteoff(off));
   2211 		(void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s %s\n",
   2212 		    "", "Written", "Read", "Diffs", "Bit count");
   2213 		(void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s\n",
   2214 		    "Offset", zero2seven, zero2seven, zero2seven);
   2215 		(void) memdiff_bufhdr(err, buf, &prev.c[0]);
   2216 		(void) memdiff_data(err, prevptr, badptr, SIZEOF_BUFHDR,
   2217 		    iolen - SIZEOF_BUFHDR);
   2218 	}
   2219 
   2220 
   2221 	(void) fflush(err);
   2222 	if (fsync(fileno(err)) == -1) {
   2223 		FSYNC_ERROR(fileno(err), diffs);
   2224 	}
   2225 	if (err != stderr) {
   2226 		(void) fclose(err);
   2227 		error->diff_file = diff_file();
   2228 		dlog(LOG_ERR, "Diffs file dumped to %s\n", diff_file());
   2229 	} else {
   2230 		mutex->stderr_exit();
   2231 		error->diff_file = NULL;
   2232 		if (sigprocmask_status == 0)
   2233 			(void) sigprocmask(SIG_SETMASK, &oset, NULL);
   2234 	}
   2235 	return (dr);
   2236 }
   2237 struct diff_return
   2238 memdiff(struct aio_str *aiop, char *str)
   2239 {
   2240 	struct diff_return dr;
   2241 
   2242 	if (!is_readonly()) {
   2243 		aiop->error.dr = memdiff_buf(aiop->off, aiop->dev, aiop->buf,
   2244 		    INDEX_TO_DIOLEN(aiop->iolen), aiop->fd, str, &aiop->error);
   2245 		return (aiop->error.dr);
   2246 	}
   2247 	dr.count = dr.bits = 0;
   2248 	return (dr);
   2249 }
   2250 /*
   2251  * Check to see if the buffer that has been read matches the previous
   2252  * buffer that was written. This would spot if an write never got to
   2253  * the disk.
   2254  */
   2255 int
   2256 check_previous_buffer(check_t check_sum, struct aio_str *aiop)
   2257 {
   2258 	uchar_t *previous_buf_written;
   2259 	struct blks *blocks;
   2260 	time_t prev_time;
   2261 
   2262 	blocks = aio_attach(aiop);
   2263 
   2264 	previous_buf_written = blocks[AIO_BLOCK_INDEX(aiop)].r.w.prev_io;
   2265 	prev_time = blocks[AIO_BLOCK_INDEX(aiop)].u.prev_requested;
   2266 
   2267 	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
   2268 
   2269 	if (previous_buf_written != NULL) {
   2270 		struct bufhdr_a hdr_a;
   2271 		struct bufhdr hdr;
   2272 		hdr = get_bufhdr(previous_buf_written);
   2273 
   2274 		hdr_a = conv_bufhdr(&hdr);
   2275 
   2276 		if (check_sum == hdr_a.chksum) {
   2277 			char *time_str;
   2278 
   2279 			time_str = alloc_time_str(prev_time);
   2280 
   2281 			pfprintf(stderr, "block %llx checksum matches"
   2282 			    " the previous block written at %s\n",
   2283 			    aio_str2byteoff(aiop), NOT_NULL(time_str));
   2284 			not_null_free(time_str);
   2285 			return (1);
   2286 		}
   2287 	}
   2288 	return (0);
   2289 }
   2290 
   2291 static int
   2292 check_old_data(struct aio_str *aiop)
   2293 {
   2294 	const char *x;
   2295 	time_t tyme;
   2296 	pid_t did;
   2297 
   2298 	if (((x = get_buf_serial_and_provider(aiop->buf)) == NULL) ||
   2299 	    cmp_serial_and_provider(x) != 0) {
   2300 		char y[SIZEOF_SERIAL_AND_PROVIDER];
   2301 
   2302 		get_serial_and_provider(y, SIZEOF_SERIAL_AND_PROVIDER);
   2303 
   2304 		if (x == NULL) {
   2305 			plog(LOG_NOTICE, "block %llx contains data that "
   2306 			    "could not be recognized.\n",
   2307 			    aio_str2byteoff(aiop));
   2308 		} else {
   2309 			plog(LOG_NOTICE, "block %llx contains data written "
   2310 			    "by host %.*s not %.*s\n", aio_str2byteoff(aiop),
   2311 			    SIZEOF_SERIAL_AND_PROVIDER, x,
   2312 			    SIZEOF_SERIAL_AND_PROVIDER, y);
   2313 		}
   2314 		return (1);
   2315 	} else if ((tyme = get_buf_time(aiop->buf)) < start_time.tv_sec) {
   2316 		char *t = alloc_time_str(tyme);
   2317 
   2318 		plog(LOG_NOTICE, "block %llx contains data written before "
   2319 		    "this instance started. It was written at %s\n",
   2320 		    aio_str2byteoff(aiop), NOT_NULL(t));
   2321 		not_null_free(t);
   2322 		return (1);
   2323 	} else if ((did = get_buf_did(aiop->buf)) != master_pid()) {
   2324 		plog(LOG_NOTICE, "block at byte offset %llx not written by "
   2325 		    "this instance, but by %ld\n", aio_str2byteoff(aiop), did);
   2326 		return (1);
   2327 	}
   2328 	return (0);
   2329 }
   2330 
   2331 int
   2332 check_header(ullong_t start, struct aio_str *aiop)
   2333 {
   2334 	if (!is_readonly()) {
   2335 		ushort16_t bufhdrchksum;
   2336 		ushort16_t hdrchksum;
   2337 
   2338 		bufhdrchksum = get_bufhdr_hdrchksum(aiop->buf);
   2339 		hdrchksum = check_bufhdr(aiop->buf, bufhdrchksum);
   2340 
   2341 		if (bufhdrchksum != hdrchksum || hdrchksum == 0) {
   2342 			plog(LOG_ERR, "block %llx bad header checksum\n",
   2343 			    aio_str2byteoff(aiop));
   2344 			return (0);
   2345 		} else {
   2346 			struct bufhdr_a hdr_a;
   2347 			ullong_t off;
   2348 			struct bufhdr hdr;
   2349 			hdr = get_bufhdr(aiop->buf);
   2350 
   2351 			(void) check_old_data(aiop);
   2352 
   2353 			hdr_a = conv_bufhdr(&hdr);
   2354 
   2355 			off = byteoff2diskomizer_off(hdr_a.off);
   2356 
   2357 			if (off != aiop->off) {
   2358 				plog(LOG_ERR,
   2359 				    "On disk header says device byte offset "
   2360 				    "%llx (0t%lld), which calculates "
   2361 				    "diskomizer block %#llx (0t%lld), I "
   2362 				    "requested diskomizer block "
   2363 				    "%#llx (0t%lld)\n",
   2364 				    hdr_a.off, hdr_a.off, off, off,
   2365 				    aiop->off, aiop->off);
   2366 				return (0);
   2367 			}
   2368 		}
   2369 	}
   2370 	return (1);
   2371 }
   2372 
   2373 int
   2374 do_memcmp(ullong_t start, struct aio_str *aiop)
   2375 {
   2376 	check_t check_sum;
   2377 	uchar_t *last;
   2378 	struct blks *blocks;
   2379 	int status = 0;
   2380 	if (check_header(start, aiop) == 0)
   2381 		return (0);
   2382 	blocks = aio_attach(aiop);
   2383 
   2384 	if ((last = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io) != NULL) {
   2385 		if (memcmp(get_buf_data(last), get_buf_data(aiop->buf),
   2386 		    INDEX_TO_DIOLEN(aiop->iolen) - SIZEOF_BUFHDR)) {
   2387 			struct bufhdr_a hdr_a;
   2388 			struct bufhdr hdr = get_bufhdr(last);
   2389 
   2390 			hdr_a = conv_bufhdr(&hdr);
   2391 			/* the memcmp failed */
   2392 			check_sum = check_aiobuf(aiop);
   2393 			dfprintf(stderr, "block %llx buf %#lx does not match "
   2394 			    "what was written, what was read %#lx,"
   2395 			    " written %#lx\n", aio_str2byteoff(aiop),
   2396 			    (ulong_t)last, check_sum, hdr_a.chksum);
   2397 			if (check_previous_buffer(check_sum, aiop) == 0) {
   2398 				check_old_data(aiop);
   2399 			}
   2400 		} else {
   2401 			status = 1;
   2402 		}
   2403 	} else {
   2404 		status = 1;
   2405 	}
   2406 	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
   2407 	return (status);
   2408 }
   2409 
   2410 /*
   2411  * print number of bytes will print the given number in full and then
   2412  * convert it to a human readable form and print it to 2 decimal places.
   2413  */
   2414 void
   2415 print_number_of_bytes(unsigned long long  x, char *singular, char *plural)
   2416 {
   2417 	char *str = x != 1 ? plural : singular;
   2418 	const char *units;
   2419 	int j;
   2420 	int y;
   2421 	/*
   2422 	 * All the units that fit in 64 bits:
   2423 	 * kilo, mega, giga, tera, peta, exa
   2424 	 */
   2425 	static const char *all_units[] = { "K", "M", "G", "T", "P", "E" };
   2426 
   2427 	(void) printf("\t%#llx, %lld, %s ", x, x, str);
   2428 
   2429 	units = NULL;
   2430 
   2431 	for (j = 0; j < (sizeof (all_units) / sizeof (all_units[0])); j++) {
   2432 		if (x / 1024) {
   2433 			y = ((x * 1000) / 1024) % 1000;
   2434 			x = x / 1024;
   2435 			units = all_units[j];
   2436 		} else {
   2437 			break;
   2438 		}
   2439 	}
   2440 
   2441 	if (units) {
   2442 		/* Round up it necessary */
   2443 		if (y % 10 >= 5) {
   2444 			y = y + 10;
   2445 		}
   2446 		/* loose the least significant digit */
   2447 		y = y/10;
   2448 		if (y >= 100) {
   2449 			y -= 100;
   2450 			x++;
   2451 		}
   2452 
   2453 		(void) printf("(%lld.%.2d %s)\n", x, y, units);
   2454 	} else {
   2455 		(void) printf("\n");
   2456 	}
   2457 }
   2458 
   2459 void
   2460 print_number(unsigned long long  i, char *singular, char *plural)
   2461 {
   2462 	char *str = i != 1 ? plural : singular;
   2463 	(void) printf("\t%#llx, %lld, %s\n", i, i, str);
   2464 }
   2465 /*
   2466  * given that the bufhdr for the io has a good check sum but is not
   2467  * for this device find the correct device and offset for the io
   2468  * and report this.
   2469  */
   2470 struct fds *
   2471 check_matching_path_io(struct bufhdr_a *hdr, struct fds *fd)
   2472 {
   2473 	struct fds *x, *fdh;
   2474 
   2475 	fdh = fd;
   2476 
   2477 	for (x = fd->next; /* make cstyle happy */; x = x->next) {
   2478 		if (memcmp(&hdr->devid, &fd->devid,
   2479 		    sizeof (struct device_id)) == 0) {
   2480 			return (fd);
   2481 		}
   2482 		if (x == fdh) {
   2483 			return (NULL);
   2484 		} else {
   2485 			fd = x;
   2486 		}
   2487 	}
   2488 	/*NOTREACHED*/
   2489 }
   2490 void
   2491 read_and_check(ullong_t start, struct device *dev, ullong_t off,
   2492 		struct aio_str *aiop)
   2493 {
   2494 	uchar_t *buf;
   2495 	ullong_t status;
   2496 	ullong_t diskoff = diskomizer_off2byteoff(off);
   2497 	struct blks *blocks;
   2498 	struct blks *block;
   2499 	struct shadow_hdr const *shadow;
   2500 	int error_count = 0;
   2501 
   2502 	buf = calloc(1, INDEX_TO_DIOLEN(max_disk_io_len));
   2503 
   2504 	if (buf == NULL) {
   2505 		CALLOC_ERROR(1L, (ulong_t)INDEX_TO_DIOLEN(max_disk_io_len));
   2506 		return;
   2507 	}
   2508 	while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) {
   2509 		if (error_count++ % 10000 == 0)
   2510 			ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off));
   2511 	}
   2512 	if (error_count > 0)
   2513 		plog(LOG_WARNING, "attached o.k.\n");
   2514 	block = &blocks[DEV_BLOCK_INDEX(dev, off)];
   2515 	aiop->daio_id.buf = block->r.w.last_io;
   2516 	aiop->daio_id.bufs =
   2517 	    INDEX_TO_DIOLEN(opts.disk_io_sizes.vals[block->r.w.last_iolen]);
   2518 	aiop->daio_id.buf_id = get_write_buf_id(block->r.w.last_io);
   2519 
   2520 	aiop->hdr = build_bufhr(dev, start, off);
   2521 	aiop->daio_id.hdr = (uchar_t *)&aiop->hdr;
   2522 	aiop->daio_id.hdr_len = sizeof (aiop->hdr);
   2523 	aiop->daio_id.footer_len = 0;
   2524 
   2525 	shadow = get_shadow_hdr(aiop->daio_id.buf);
   2526 	aiop->daio_id.chksum = shadow->chksums[aiop->iolen];
   2527 
   2528 	status = daio->pread(dev->fdhead->fd, buf,
   2529 	    opts.disk_io_sizes.vals[block->r.w.last_iolen], diskoff,
   2530 	    &aiop->daio_id);
   2531 	if (status == DAIO_CORRUPT) {
   2532 		int check_sum;
   2533 		struct error error;
   2534 		ulong_t shadow_chksum = check_bufbody(block->r.w.last_io,
   2535 		    opts.disk_io_sizes.vals[block->r.w.last_iolen]);
   2536 
   2537 		ZERO_OBJ(error);
   2538 
   2539 		check_sum = check_buf(buf,
   2540 		    INDEX_TO_DIOLEN(block->r.w.last_iolen), &error);
   2541 
   2542 		if (check_sum != shadow_chksum) {
   2543 			pfprintf(stderr,
   2544 			    "Off %#llx (%lld) header differs "
   2545 			    "on disk\n", diskoff, diskoff);
   2546 			aiop->error.dr = memdiff_buf(off, dev, buf,
   2547 			    opts.disk_io_sizes.vals[block->r.w.last_iolen],
   2548 			    dev->fdhead, "read and check", &aiop->error);
   2549 		} else {
   2550 			if (memcmp(get_buf_data(buf),
   2551 			    get_buf_data(block->r.w.last_io),
   2552 			    INDEX_TO_DIOLEN(block->r.w.last_iolen) -
   2553 			    SIZEOF_BUFHDR) != 0) {
   2554 				pfprintf(stderr,
   2555 				    "Off %#llx (%lld) body differs "
   2556 				    "on disk\n", diskoff, diskoff);
   2557 				aiop->error.dr = memdiff_buf(off, dev, buf,
   2558 				    opts.disk_io_sizes.vals[
   2559 				    block->r.w.last_iolen],
   2560 				    dev->fdhead, "read and check",
   2561 				    &aiop->error);
   2562 			} else {
   2563 				ullong_t off = aio_str2byteoff(aiop);
   2564 
   2565 				pfprintf(stderr,
   2566 				    "Data at byte offset %#llx (%lld) "
   2567 				    "on disk, matches the data just read "
   2568 				    "from %#llx (%lld)\n", diskoff, diskoff,
   2569 				    off, off);
   2570 		}
   2571 		}
   2572 	} else if (status != opts.disk_io_sizes.vals[block->r.w.last_iolen]) {
   2573 		PREAD_ERROR(dev->fdhead->fd, dev->fdhead->name, (ulong_t)buf,
   2574 		    opts.disk_io_sizes.vals[block->r.w.last_iolen], diskoff);
   2575 	}
   2576 	shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   2577 	free(buf);
   2578 }
   2579 /*
   2580  * Check the buffer contents matches an io which it internally thinks it
   2581  * is. This error path is used when we have read a block X but the contents
   2582  * of the block is not for block X but for block Y. So this routine gives
   2583  * information regarding the last movements of block Y.
   2584  *
   2585  * This routine must be called with all the mutexs held, see
   2586  *	mutex->grab_all().
   2587  *
   2588  * To Do:
   2589  *	bounds checking must be done on off before using it!
   2590  */
   2591 void
   2592 check_by_buffer(ullong_t start, struct device *dev, struct aio_str *aiop)
   2593 {
   2594 	struct bufhdr_a read_hdr;
   2595 	struct bufhdr hdr;
   2596 	ullong_t off;
   2597 	struct blks *block;
   2598 	struct blks *blocks;
   2599 	ulong_t shadow_chksum;
   2600 	int error_count = 0;
   2601 	bitmap_t *map;
   2602 
   2603 
   2604 	hdr = get_bufhdr(aiop->buf);
   2605 
   2606 	read_hdr = conv_bufhdr(&hdr);
   2607 
   2608 	off = byteoff2diskomizer_off(read_hdr.off);
   2609 	if ((long long)off < 0 || off >= LEN_BYTES2BLOCKS(dev)) {
   2610 		return;
   2611 	}
   2612 
   2613 	aiop->error.doff = off;
   2614 	aiop->error.dev = dev;
   2615 
   2616 	if (aiop->dev == dev && off == aiop->off) {
   2617 		return;
   2618 	}
   2619 
   2620 	map =  attach_dev_writemap(dev);
   2621 
   2622 	if (test_write(map, off, LEN_BYTES2BLOCKS(dev))) {
   2623 	/*
   2624 	 * Grr. There is an IO outstanding on this block on this device,
   2625 	 * I don't know whether it is a read or a write
   2626 	 */
   2627 		shm_ops->detach(dev->writemap_handle);
   2628 		aiop->error.desc.UNABLE_TO_LOCK = 1;
   2629 		dfprintf(stderr,
   2630 		    "Block %#llx 0t%lld is currently locked for dev %s\n",
   2631 		    diskomizer_off2byteoff(off),
   2632 		    diskomizer_off2byteoff(off), dev->logicalname);
   2633 		return;
   2634 
   2635 	}
   2636 	shm_ops->detach(dev->writemap_handle);
   2637 	while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) {
   2638 		if (error_count++ % 10000 == 0)
   2639 			ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off));
   2640 	}
   2641 	if (error_count > 0)
   2642 		plog(LOG_WARNING, "attached o.k.\n");
   2643 	block = &blocks[DEV_BLOCK_INDEX(dev, off)];
   2644 
   2645 	if (block->r.w.last_io != NULL &&
   2646 	    (shadow_chksum = check_bufbody(block->r.w.last_io,
   2647 	    opts.disk_io_sizes.vals[block->r.w.last_iolen])) ==
   2648 	    read_hdr.chksum &&
   2649 	    memcmp(get_buf_data(aiop->buf),
   2650 	    get_buf_data(block->r.w.last_io),
   2651 	    opts.disk_io_sizes.vals[aiop->iolen] - SIZEOF_BUFHDR) == 0) {
   2652 		struct fds *fd;
   2653 
   2654 		fd = find_path(dev->fdhead, block->path_id);
   2655 		assert(fd != NULL);
   2656 		aiop->error.desc.MATCHING_LAST = 1;
   2657 		aiop->error.last_requested = block->last_requested;
   2658 		aiop->error.delta = block->last_returned_delta;
   2659 		aiop->error.doff = off;
   2660 		aiop->error.dev = dev;
   2661 		aiop->error.path_id = block->path_id;
   2662 
   2663 		dlog(LOG_ERR, "Buffer matches last write to block %#llx "
   2664 		    "(0t%lld) (block %#llx 0t%lld) on dev %s path %s\n",
   2665 		    off, off, diskomizer_off2byteoff(off),
   2666 		    diskomizer_off2byteoff(off), dev->logicalname,
   2667 		    fd->name);
   2668 		dtime_log(LOG_ERR, block->last_requested,
   2669 		    "Last write to %s block %#llx (0t%lld) requested",
   2670 		    dev->logicalname, diskomizer_off2byteoff(off),
   2671 		    diskomizer_off2byteoff(off));
   2672 
   2673 		dtime_log(LOG_ERR,
   2674 		    block->last_requested +
   2675 		    block->last_returned_delta,
   2676 		    "Last write to %s block %#llx (0t%lld)"
   2677 		    " returned ", dev->logicalname,
   2678 		    diskomizer_off2byteoff(off),
   2679 		    diskomizer_off2byteoff(off));
   2680 		read_and_check(start, dev, off, aiop);
   2681 
   2682 	} else if (block->r.w.prev_io != NULL) {
   2683 		shadow_chksum = check_bufbody(block->r.w.prev_io,
   2684 		    MIN(opts.disk_io_sizes.vals[block->r.w.last_iolen],
   2685 		    opts.disk_io_sizes.vals[block->r.w.prev_iolen]));
   2686 		if (shadow_chksum == read_hdr.chksum &&
   2687 		    memcmp(get_buf_data(aiop->buf),
   2688 		    get_buf_data(block->r.w.prev_io),
   2689 		    opts.disk_io_sizes.vals[aiop->iolen] -
   2690 		    SIZEOF_BUFHDR)) {
   2691 
   2692 			aiop->error.desc.MATCHING_PREV = 1;
   2693 			aiop->error.doff = off;
   2694 			aiop->error.last_requested = block->last_requested;
   2695 
   2696 			dlog(LOG_ERR,
   2697 			    "Buffer matches block %#llx (block %lld) dev "
   2698 			    "%s prev io\n",
   2699 			    (ullong_t)off, diskomizer_off2byteoff(off),
   2700 			    dev->logicalname);
   2701 			dtime_log(LOG_ERR, block->u.prev_requested,
   2702 			    "Prev write to %s block %#llx (0t%lld)"
   2703 			    " requested %s\n",
   2704 			    dev->logicalname,
   2705 			    diskomizer_off2byteoff(off),
   2706 			    diskomizer_off2byteoff(off));
   2707 		} else {
   2708 			dfprintf(stderr, "Buffer claiming to be from block "
   2709 			    "%#llx dev %s does not match either of the "
   2710 			    "last two ios.\n",
   2711 			    (ullong_t)diskomizer_off2byteoff(off),
   2712 			    dev->logicalname);
   2713 		}
   2714 	}
   2715 	shm_ops->detach(DEV_BLOCK_HANDLE(dev, off));
   2716 }
   2717 /*ARGSUSED*/
   2718 void
   2719 check_matching_io(ullong_t start, struct aio_str *aiop)
   2720 {
   2721 	struct device *device;
   2722 	struct bufhdr_a hdr_a;
   2723 	struct bufhdr hdr = get_bufhdr(aiop->buf);
   2724 
   2725 	hdr_a = conv_bufhdr(&hdr);
   2726 
   2727 	if (hdr_a.hdrchksum != check_bufhdr(aiop->buf, hdr_a.hdrchksum)) {
   2728 		return;
   2729 	}
   2730 	mutex->grab_all();
   2731 	for (device = devices; device != NULL; device = device->next) {
   2732 		struct fds *fd;
   2733 		if ((fd = check_matching_path_io(&hdr_a,
   2734 		    device->fdhead)) != NULL) {
   2735 			/*
   2736 			 * Now we have the device to which this io was sent
   2737 			 */
   2738 			aiop->error.desc.MATCHING_DEVICE = 1;
   2739 			aiop->error.fd = fd;
   2740 			aiop->error.doff = byteoff2diskomizer_off(hdr_a.off);
   2741 			aiop->error.last_requested = hdr_a.time;
   2742 			check_by_buffer(start, device, aiop);
   2743 			dfprintf(stderr, "Block read from %s matches block "
   2744 			    "written to %s\n", aiop->fd->name,
   2745 			    fd->name);
   2746 		}
   2747 	}
   2748 	mutex->drop_all();
   2749 }
   2750 
   2751 static loop_type
   2752 on_error_pause(ullong_t start, struct aio_str *aiop)
   2753 {
   2754 	int isread = is_read_io(aiop);
   2755 	pfprintf(stderr, "On %s error pause %d seconds\n",
   2756 	    isread ? "read" : "write", OPTION(pause_time));
   2757 	(void) sleep(opts.pause_time);
   2758 
   2759 	if (!isread) {
   2760 		return (CONTINUE);
   2761 	}
   2762 
   2763 	if (!do_memcmp(start, aiop)) {
   2764 		check_matching_io(start, aiop);
   2765 		memdiff(aiop, "pause");
   2766 		return (CONTINUE);
   2767 	} else {
   2768 		return (BREAK);
   2769 	}
   2770 }
   2771 static void
   2772 bring_error_path_online(struct fds *fd)
   2773 {
   2774 	struct fds *start = fd;
   2775 
   2776 	do {
   2777 		if (fd->error_path == 1) {
   2778 			pfprintf(stderr,
   2779 			    "Path %s brought on line\n", fd->name);
   2780 			fd->error_path = 0;
   2781 			break;
   2782 		}
   2783 		fd = fd->next;
   2784 	} while (fd != start);
   2785 }
   2786 /*ARGSUSED*/
   2787 static loop_type
   2788 on_error_fail_path(ullong_t start, struct aio_str *aiop)
   2789 {
   2790 	char *name = aiop->fd->name;
   2791 	pfprintf(stderr,
   2792 	    "On error fail path %s failed\n", aiop->fd->name);
   2793 	if (set_shared_stop_flag(aiop->fd->shared_data_handle) == -1) {
   2794 		aiop->fd->need_to_stop = 1;
   2795 	} else {
   2796 		aiop->fd->stop_flag = 1;
   2797 		cancel_all_io_byfd(aiop->fd);
   2798 		snapshot_recent(aiop->dev->recent);
   2799 		bring_error_path_online(aiop->fd);
   2800 	}
   2801 	newfd(aiop);
   2802 	if (aiop->fd->stop_flag == 1 || aiop->fd->need_to_stop == 1) {
   2803 	/*
   2804 	 * All the paths have failed, we muddle on to complete any furhter
   2805 	 * error action down this failed path.
   2806 	 */
   2807 		pfprintf(stderr, "On error fail path %s continuing\n", name);
   2808 		return (BREAK);
   2809 	} else {
   2810 		aiop->retrycnt = 0;
   2811 		pfprintf(stderr, "On error fail path %s retrying\n", name);
   2812 		return (RETRY);
   2813 	}
   2814 }
   2815 
   2816 /*ARGSUSED*/
   2817 static loop_type
   2818 on_error_retry(ullong_t start, struct aio_str *aiop)
   2819 {
   2820 	int isread = is_read_io(aiop);
   2821 	short max = (isread ?
   2822 	    OPTION(max_read_retries) : OPTION(max_write_retries));
   2823 	if (aiop->retrycnt < max) {
   2824 		pfprintf(stderr,
   2825 		    "On %s error retry %d, %d remaining %s blk %#llx\n",
   2826 		    isread ? "read": "write",
   2827 		    1+aiop->retrycnt, max-(1+aiop->retrycnt),
   2828 		    aiop->fd->name, aio_str2byteoff(aiop));
   2829 		return (RETRY);
   2830 	} else {
   2831 		return (CONTINUE);
   2832 	}
   2833 }
   2834 
/*
 * on_error_rewrite:
 *	Error action: write the buffer held by aiop again, this time with a
 *	direct call to daio->pwrite() rather than by queueing an aio.
 *
 *	start	added to the scaled block offset (aiop->off) to form the
 *		offset handed to pwrite.
 *	aiop	the io to rewrite; its daio_id, tv and aio_res are updated.
 *
 *	Returns BREAK if pwrite returned the full io length.  Otherwise the
 *	error is reported as ERR_SYS, the device error count is bumped and
 *	CONTINUE is returned.
 */
static loop_type
on_error_rewrite(ullong_t start, struct aio_str *aiop)
{
	ssize_t status;
	union err_info err_info;
	struct shadow_hdr const *shadow_hdr = get_shadow_hdr(aiop->buf);
	ullong_t offset = (ullong_t)start +
	    (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*aiop->off);

	err_info.str = "pwrite";
	pfprintf(stderr, "%s On error rewrite\n", aiop->fd->name);

	/* Rebuild the daio id so that it describes the buffer being sent. */
	aiop->daio_id.buf = aiop->buf;
	aiop->daio_id.buf_id = get_write_buf_id(aiop->buf);
	aiop->daio_id.chksum =  shadow_hdr->chksums[aiop->iolen];
	aiop->daio_id.bufs = opts.disk_io_sizes.vals[aiop->iolen];
	aiop->daio_id.hdr = (uchar_t *)&aiop->hdr;
	aiop->daio_id.footer_len = 0;
	aiop->daio_id.hdr_len = sizeof (aiop->hdr);
	/* Keep a copy of the start of the buffer in aiop->hdr. */
	(void) memcpy(aiop->daio_id.hdr, aiop->daio_id.buf,
	    aiop->daio_id.hdr_len);

	(void) my_gettimeofday(&aiop->tv, NULL);
	status = daio->pwrite(aiop->fd->fd, aiop->buf,
	    opts.disk_io_sizes.vals[aiop->iolen], offset, &aiop->daio_id);
	/* Record the result in aio_res, as is done for a completed aio. */
	DAIO_SET_RETURN(aiop->aio_res, status);
	DAIO_SET_ERROR(aiop->aio_res, errno);

	if (status != opts.disk_io_sizes.vals[aiop->iolen]) {
		report_error(aiop, err_info, ERR_SYS);
		aiop->dev->errors += 1;
		return (CONTINUE);
	}
	return (BREAK);
}
   2870 
/*
 * on_error_reread:
 *	Error action: read the block again with a direct call to
 *	daio->pread() into aiop->buf.
 *
 *	start	added to the scaled block offset (aiop->off) to form the
 *		offset handed to pread.
 *	aiop	the io to re-read; its tv and aio_res are updated.
 *
 *	Returns BREAK if the read returned the full io length, or if it was
 *	flagged DAIO_CORRUPT but we are not read only and do_memcmp()
 *	returns non-zero.  Any other outcome is reported, counted against
 *	the device and gives CONTINUE.
 */
static loop_type
on_error_reread(ullong_t start, struct aio_str *aiop)
{
	ssize_t status;
	union err_info err_info;
	ullong_t offset = (ullong_t)start +
	    (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*aiop->off);

	err_info.str = "pread";
	pfprintf(stderr, "%s On error re-read\n", aiop->fd->name);

	(void) my_gettimeofday(&aiop->tv, NULL);
	status = daio->pread(aiop->fd->fd, aiop->buf,
	    opts.disk_io_sizes.vals[aiop->iolen], offset, &aiop->daio_id);
	/* Record the result in aio_res, as is done for a completed aio. */
	DAIO_SET_RETURN(aiop->aio_res, status);
	DAIO_SET_ERROR(aiop->aio_res, errno);
	if (status == DAIO_CORRUPT) {
		/*
		 * In read only mode the comparison is skipped and the
		 * corrupt status is taken at face value.
		 */
		if (is_readonly() || !do_memcmp(start, aiop)) {
			report_error(aiop, err_info, ERR_CORRUPT);
			aiop->dev->errors += 1;
			memdiff(aiop, err_info.str);
			return (CONTINUE);
		}
		/* There should be an assert here */
	} else if (status != opts.disk_io_sizes.vals[aiop->iolen]) {
		report_error(aiop, err_info, ERR_SYS);
		aiop->dev->errors += 1;
		return (CONTINUE);
	}
	return (BREAK);
}
   2902 
   2903 static int
   2904 do_path_stop_check(struct fds *fd, struct device *dev)
   2905 {
   2906 	if (fd->error_path == 0 && fd->stop_flag == 0) {
   2907 		if (fd->need_to_stop == 1 &&
   2908 		    set_shared_stop_flag(fd->shared_data_handle) != -1) {
   2909 			fd->need_to_stop = 0;
   2910 			/*
   2911 			 * need to cancel all the io outstanding for this
   2912 			 * path
   2913 			 */
   2914 			fd->stop_flag = 1;
   2915 			cancel_all_io_byfd(fd);
   2916 			snapshot_recent(dev->recent);
   2917 			bring_error_path_online(fd);
   2918 		} else {
   2919 			if (do_stop_check(fd->shared_data_handle) == 1) {
   2920 				fd->stop_flag = 1;
   2921 				cancel_all_io_byfd(fd);
   2922 				bring_error_path_online(fd);
   2923 			} else {
   2924 				return (0);
   2925 			}
   2926 		}
   2927 	}
   2928 	return (1);
   2929 }
   2930 
   2931 static int
   2932 init_stop_check(void)
   2933 {
   2934 	stop_check = do_stop_check;
   2935 	return (1);
   2936 }
   2937 
   2938 static int
   2939 init_path_stop_check(void)
   2940 {
   2941 	path_stop_check = do_path_stop_check;
   2942 	return (1);
   2943 }
   2944 
   2945 void
   2946 newfd(struct aio_str *aiop)
   2947 {
   2948 	struct fds *fd = aiop->fd;
   2949 	if (aiop->fd != aiop->fd->next) {
   2950 		while (aiop->fd->next != fd) {
   2951 			if (path_stop_check(aiop->fd->next, aiop->dev) == 0) {
   2952 				aiop->fd = aiop->fd->next;
   2953 				remove_from_aio_list(&fd->all_aios, aiop);
   2954 				add_to_all_aios(aiop);
   2955 				return;
   2956 			}
   2957 			aiop->fd = aiop->fd->next;
   2958 		}
   2959 		/*
   2960 		 * To get here we searched them all and found none that
   2961 		 * were not error paths or had been stopped. Reset the path
   2962 		 * back to the original.
   2963 		 */
   2964 		aiop->fd = fd;
   2965 	}
   2966 
   2967 }
   2968 loop_type
   2969 handle_err_generic(struct aio_str *aiop, ullong_t start,
   2970 	on_error_t *on_error_func)
   2971 {
   2972 	struct blks *blocks = NULL;
   2973 	loop_type status = BREAK;
   2974 
   2975 	blocks = aio_attach(aiop);
   2976 
   2977 	aiop->dev->errors++;
   2978 	if (blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io !=
   2979 	    NULL && does_check(daio->what_checker())) {
   2980 		char i;
   2981 		union err_info err_info;
   2982 
   2983 		err_info.str = "aioread";
   2984 		if (on_error_func == on_error_corrupt) {
   2985 			memdiff(aiop, (aiop->retrycnt == 0) ?
   2986 			    "aioread" : "aioread RETRY");
   2987 			check_matching_io(start, aiop);
   2988 			report_error(aiop, err_info, ERR_CORRUPT);
   2989 		} else {
   2990 			report_error(aiop, err_info, ERR_SYS);
   2991 		}
   2992 
   2993 		for (i = 0; on_error_func[i] != NULL; i++) {
   2994 			loop_type l;
   2995 			if ((l = on_error_func[i](start, aiop)) == BREAK)
   2996 				break;
   2997 			else if (l == RETRY) {
   2998 				status = RETRY;
   2999 				break;
   3000 			}
   3001 		}
   3002 	}
   3003 	if (incr_shared_device_error(aiop->dev->shared_data_handle,
   3004 	    aiop->dev->errors) != -1) {
   3005 		aiop->dev->errors = 0;
   3006 	}
   3007 	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
   3008 	return (status);
   3009 }
   3010 loop_type
   3011 handle_write_error(struct aio_str *aiop, ullong_t start)
   3012 {
   3013 	loop_type status = BREAK;
   3014 	int i;
   3015 
   3016 	aiop->dev->errors++;
   3017 	for (i = 0; on_write_error[i] != NULL; i++) {
   3018 		loop_type l;
   3019 		if ((l = on_write_error[i](start, aiop)) == BREAK)
   3020 			break;
   3021 		else if (l == RETRY) {
   3022 			status = RETRY;
   3023 			break;
   3024 		}
   3025 	}
   3026 	if (incr_shared_device_error(aiop->dev->shared_data_handle,
   3027 	    aiop->dev->errors) != -1) {
   3028 		aiop->dev->errors = 0;
   3029 	}
   3030 	return (status);
   3031 }
   3032 loop_type
   3033 handle_read_corrupt(struct aio_str *aiop, ullong_t start)
   3034 {
   3035 	return (handle_err_generic(aiop, start, on_error_corrupt));
   3036 }
   3037 
   3038 loop_type
   3039 handle_read_short(struct aio_str *aiop, ullong_t start)
   3040 {
   3041 	return (handle_err_generic(aiop, start, on_error_short));
   3042 }
/*
 * handle_readonly:
 *	Completion handler shared by the two read only modes.
 *
 *	aiop		the io that has completed (or has never been issued,
 *			in which case aiop->fd is NULL).
 *	start		passed through to the error handlers and to
 *			do_new_read().
 *	read_type	the kind of read to issue next.
 *
 *	A corrupt or short read goes to the matching error handler and is
 *	reissued as a RETRY_READ if the handler asks for that.  A good read
 *	updates the per block history.  In every non-retry case the block's
 *	read_count is bumped, the io is offered a new path and a new read
 *	is started.
 *
 *	Returns whatever do_new_read() returns.
 */
static time_t
handle_readonly(struct aio_str *aiop, ullong_t start, read_type_t read_type)
{
	struct blks *blocks = NULL;
	struct timeval tv;
	hrtime_t delta = DAIO_GET_TIME_TAKEN(aiop->aio_res);


	TNF_PROBE_2(handle_read, "handle_readonly",
	    "sunw%cte%diskomizer%aio readonly wait",
	    tnf_opaque, aiop, aiop,
	    aio_tnf_str, *aiop, aiop);

	while (my_gettimeofday(&tv, NULL) == -1)
		pperror("gettimeofday");

	if (aiop->fd == NULL) {
		/* Never issued: just attach it to the first path. */
		aiop->fd = aiop->dev->fdhead;
		add_to_aio_list(&aiop->fd->all_aios, aiop);
	} else {
		if (DAIO_RETURN(aiop->aio_res) == DAIO_CORRUPT) {
			if (handle_read_corrupt(aiop, start) == RETRY) {
				return (do_new_read(aiop, start, RETRY_READ));
			}
		} else if (DAIO_RETURN(aiop->aio_res) !=
		    opts.disk_io_sizes.vals[aiop->iolen]) {
			if (handle_read_short(aiop, start) == RETRY) {
				return (do_new_read(aiop, start, RETRY_READ));
			}
		} else {
			struct blks *block;
			aiop->fd->last_read_time = delta;
			if (aiop->retrycnt != 0) {
				char *now_str;

				now_str = alloc_time_str(tv.tv_sec);
				pprintf("Read retry %d of block 0x%llx "
				    "on %s o.k. %s\n",
				    aiop->retrycnt,
				    aio_str2byteoff(aiop),
				    aiop->fd->name,
				    NIL(now_str));
				not_null_free(now_str);
			}
			blocks = aio_attach(aiop);

			/*
			 * Shift the block's "last" io details into the
			 * "prev" slots and record this read as the last.
			 */
			block = &blocks[AIO_BLOCK_INDEX(aiop)];
			block->r.o.prev_io =
			    blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io;
			block->r.o.prev_chksum =
			    blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_chksum;
			/* Only the bottom bit of the buffer address is kept. */
			block->r.o.last_io =
			    (((ulong_t)aiop->daio_id.buf) & 0x1);
			block->r.o.last_chksum = aiop->daio_id.chksum;

			block->u.prev_requested = block->last_requested;
			block->last_requested = aiop->tv.tv_sec;
			block->last_returned_delta =
			    tv.tv_sec - aiop->tv.tv_sec;
		}
		/* The error branches above get here without attaching. */
		if (blocks == NULL) {
			blocks = aio_attach(aiop);
		}

		blocks[AIO_BLOCK_INDEX(aiop)].read_count += 1;
		shm_ops->detach(AIO_BLOCK_HANDLE(aiop));

		clear_writemap(aiop);
		newfd(aiop);
	}
	update_aio_read_stats(aiop);
	aiop->count++;
	return_aio_read_buf(aiop);

	return (do_new_read(aiop, start, read_type));
}
   3119 time_t
   3120 handle_readonly_rand(struct aio_str *aiop, ullong_t start)
   3121 {
   3122 	return (handle_readonly(aiop, start, READ_ONLY_RAND));
   3123 }
   3124 time_t
   3125 handle_readonly_seq(struct aio_str *aiop, ullong_t start)
   3126 {
   3127 	return (handle_readonly(aiop, start, READ_ONLY_SEQ));
   3128 }
   3129 
   3130 time_t
   3131 handle_read(struct aio_str *aiop, ullong_t start)
   3132 {
   3133 	struct blks *blocks = NULL;
   3134 	hrtime_t delta = DAIO_GET_TIME_TAKEN(aiop->aio_res);
   3135 
   3136 	TNF_PROBE_2(handle_read, "handle_read",
   3137 	    "sunw%cte%diskomizer%aio read wait",
   3138 	    tnf_opaque, aiop, aiop,
   3139 	    aio_tnf_str, *aiop, aiop);
   3140 
   3141 	aiop->fd->total_read--;
   3142 
   3143 	if (DAIO_RETURN(aiop->aio_res) == DAIO_CORRUPT) {
   3144 		if (!do_memcmp(start, aiop) &&
   3145 		    handle_read_corrupt(aiop, start) == RETRY) {
   3146 			return (do_new_read(aiop, start, RETRY_READ));
   3147 		}
   3148 	} else if (DAIO_RETURN(aiop->aio_res) !=
   3149 	    opts.disk_io_sizes.vals[aiop->iolen]) {
   3150 		if (handle_read_short(aiop, start) == RETRY) {
   3151 			return (do_new_read(aiop, start, RETRY_READ));
   3152 		}
   3153 	} else {
   3154 		aiop->fd->last_read_time = delta;
   3155 		if (aiop->retrycnt != 0) {
   3156 			pprintf("Read retry %d of block 0x%llx on %s o.k.\n",
   3157 			    aiop->retrycnt, aio_str2byteoff(aiop),
   3158 			    aiop->fd->name);
   3159 		}
   3160 		if (opts.obscure_execute && is_executable(aiop->buf)) {
   3161 			run_func(aiop->buf,
   3162 			    opts.disk_io_sizes.vals[aiop->iolen] -
   3163 			    SIZEOF_BUFHDR);
   3164 		}
   3165 	}
   3166 	if (blocks == NULL) {
   3167 		blocks = aio_attach(aiop);
   3168 	}
   3169 
   3170 	blocks[AIO_BLOCK_INDEX(aiop)].read_count += 1;
   3171 	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
   3172 
   3173 	clear_writemap(aiop);
   3174 
   3175 	update_aio_read_stats(aiop);
   3176 	aiop->count++;
   3177 	return_aio_read_buf(aiop);
   3178 
   3179 	newfd(aiop);
   3180 	return (do_new_read(aiop, start, NORMAL_READ));
   3181 }
   3182 time_t
   3183 handle_read_then_write(struct aio_str *aiop, ullong_t start)
   3184 {
   3185 	hrtime_t delta = DAIO_GET_TIME_TAKEN(aiop->aio_res);
   3186 	struct timeval tv;
   3187 
   3188 	while (my_gettimeofday(&tv, NULL) == -1)
   3189 		pperror("gettimeofday");
   3190 
   3191 	update_aio_read_stats(aiop);
   3192 	if (aiop->fd == NULL) {
   3193 		/* This is the first write so no read to check */
   3194 		aiop->count++;
   3195 		aiop->buf = get_write_buf();
   3196 		aiop->fd = aiop->dev->fdhead;
   3197 		add_to_aio_list(&aiop->fd->all_aios, aiop);
   3198 	} else {
   3199 		aiop->fd->total_read--;
   3200 
   3201 		if (DAIO_RETURN(aiop->aio_res) == DAIO_CORRUPT) {
   3202 			/* handle read error */
   3203 			if (!do_memcmp(start, aiop) &&
   3204 			    handle_read_corrupt(aiop, start) == RETRY) {
   3205 				return (do_new_read(aiop, start, RETRY_READ));
   3206 			}
   3207 		} else if (DAIO_RETURN(aiop->aio_res) !=
   3208 		    opts.disk_io_sizes.vals[aiop->iolen]) {
   3209 			/* handle read error */
   3210 			if (handle_read_short(aiop, start) == RETRY) {
   3211 				return (do_new_read(aiop, start, RETRY_READ));
   3212 			}
   3213 		} else {
   3214 			if (opts.obscure_execute && is_executable(aiop->buf)) {
   3215 				run_func(aiop->buf,
   3216 				    opts.disk_io_sizes.vals[aiop->iolen] -
   3217 				    SIZEOF_BUFHDR);
   3218 			}
   3219 			aiop->fd->last_read_time = delta;
   3220 		}
   3221 		return_read_buf(aiop->buf);
   3222 		/*
   3223 		 * Need to return the disk block to the free list
   3224 		 *
   3225 		 * the use of clear_writemap_success() reflects the fact that
   3226 		 * to get here the write to this block must have succeeded.
   3227 		 */
   3228 		clear_writemap_success(aiop);
   3229 		aiop->buf = get_write_buf();
   3230 		(void) set_io_len(aiop);
   3231 		newfd(aiop);
   3232 		aiop->count++;
   3233 	}
   3234 	aiop->handler = handle_write_then_read;
   3235 	do_new_write(aiop, start, 0);
   3236 	return (tv.tv_sec);
   3237 }
   3238 time_t
   3239 handle_write_then_read(struct aio_str *aiop, ullong_t start)
   3240 {
   3241 	struct timeval tv;
   3242 	struct blks *block;
   3243 	struct blks *blocks;
   3244 	struct bufhdr hdr;
   3245 
   3246 	while (my_gettimeofday(&tv, NULL) == -1)
   3247 		pperror("gettimeofday");
   3248 
   3249 	assert(aiop->buf == NULL || *aiop->buf == 0xAA || *aiop->buf == 0x55);
   3250 
   3251 	aiop->fd->total_write--;
   3252 
   3253 	if (DAIO_RETURN(aiop->aio_res) !=
   3254 	    opts.disk_io_sizes.vals[aiop->iolen]) {
   3255 		union err_info err_info;
   3256 		err_info.str = "aiowrite";
   3257 		report_error(aiop, err_info, ERR_SYS);
   3258 		if (handle_write_error(aiop, start) == RETRY) {
   3259 			do_new_write(aiop, start, 1);
   3260 		} else {
   3261 			if (is_sequential(aiop) &&
   3262 			    (aiop->off % opts.nprocs) ==  this_proc()) {
   3263 				push_unwritten(aiop);
   3264 			}
   3265 			do_new_write(aiop, start, 0);
   3266 		}
   3267 		return (tv.tv_sec);
   3268 	}
   3269 
   3270 	update_aio_write_stats(aiop);
   3271 	aiop->count++;
   3272 	aiop->fd->last_write_time = DAIO_GET_TIME_TAKEN(aiop->aio_res);
   3273 	hdr = get_bufhdr(aiop->buf);
   3274 	blocks = aio_attach(aiop);
   3275 
   3276 	block = &blocks[AIO_BLOCK_INDEX(aiop)];
   3277 	block->r.w.prev_io = block->r.w.last_io;
   3278 	block->r.w.prev_iolen = block->r.w.last_iolen;
   3279 	block->u.prev_requested = block->last_requested;
   3280 	block->last_requested = aiop->tv.tv_sec;
   3281 	block->last_returned_delta = tv.tv_sec - aiop->tv.tv_sec;
   3282 	block->r.w.last_io = aiop->buf;
   3283 	block->r.w.last_iolen = aiop->iolen;
   3284 	if (hdr.start == BUF_TYPE_A)
   3285 		block->ab = 1;
   3286 	else
   3287 		block->ab = 0;
   3288 
   3289 	block->read_count = 0;
   3290 	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));
   3291 	return_write_buf(aiop->buf);
   3292 	aiop->buf = get_read_buf();
   3293 	aiop->handler = handle_read_then_write;
   3294 	return (do_new_read(aiop, start, WRITE_READ));
   3295 }
/*
 * handle_write:
 *	Completion handler for a write.
 *
 *	aiop	the io that has completed (or has never been issued, in
 *		which case aiop->fd is NULL).
 *	start	passed through to the error handlers and do_new_write().
 *
 *	A write that did not return the full io length is reported and
 *	passed to handle_write_error(), then reissued either as a retry or
 *	as a new write.  A good write updates the per block history,
 *	possibly releases the write buffer, offers the io a new path and
 *	starts the next write.
 *
 *	Returns the current time in seconds.
 */
time_t
handle_write(struct aio_str *aiop, ullong_t start)
{
	struct timeval tv;
	union err_info err_info;
	struct bufhdr hdr;
	struct blks *block;
	struct blks *blocks;

	err_info.str = "aiowrite";

	if (aiop->buf != NULL) {
		/*
		 * The first byte of a write buffer is expected to be 0xAA
		 * or 0x55.  If it is anything else the buffer has been
		 * altered; store to it while expecting a SIGSEGV and then
		 * exit.
		 */
		if (*aiop->buf != 0xAA && *aiop->buf != 0x55) {
			char tmp = *aiop->buf;
			void *sig = expect_signal(SIGSEGV,
			    "Buffer not mapped writable but was updated!",
			    aiop->buf, sizeof (*aiop->buf));
			*aiop->buf = 0;
			*aiop->buf = tmp;
			cancel_expected_signal(SIGSEGV, sig);
			exit(1);
		}
		if (aiop->fd != NULL) {
			aiop->fd->total_write--;
		}
	} else {
		if (aiop->fd != NULL) {
			aiop->fd->total_write--;
			plog(LOG_WARNING, "buf == NULL, off %#llx (0t%lld)\n",
			    (ullong_t)aiop->off, (ullong_t)aiop->off);
		}
	}

	TNF_PROBE_2(handle_write, "handle_write",
	    "sunw%cte%diskomizer%aio write wait",
	    tnf_opaque, aiop, aiop,
	    aio_tnf_str, *aiop, aiop);
	while (my_gettimeofday(&tv, NULL) == -1)
		pperror("gettimeofday");

	if (DAIO_RETURN(aiop->aio_res) !=
	    opts.disk_io_sizes.vals[aiop->iolen]) {
		/* retry the write */
		if (aiop->fd != NULL) {
			report_error(aiop, err_info, ERR_SYS);

			if (handle_write_error(aiop, start) == RETRY) {
				do_new_write(aiop, start, 1);
			} else {
				if (is_sequential(aiop) &&
				    (aiop->off % opts.nprocs) ==
				    this_proc()) {
					push_unwritten(aiop);
				}
				do_new_write(aiop, start, 0);
			}
			return (tv.tv_sec);
		} else {
			/*
			 * No path yet, so this io has never been issued:
			 * attach it to the first path and start it.
			 */
			aiop->fd = aiop->dev->fdhead;
			add_to_aio_list(&aiop->fd->all_aios, aiop);
			do_new_write(aiop, start, 0);
			return (tv.tv_sec);
		}
	}
	/*
	 * NOTE(review): aiop->fd is dereferenced from here on without a
	 * NULL check, although the code above allows for it being NULL.
	 * Presumably a never-issued io always takes the branch above;
	 * confirm.
	 */
	aiop->fd->last_write_time = DAIO_GET_TIME_TAKEN(aiop->aio_res);

	update_aio_write_stats(aiop);
	aiop->count++;

	hdr = get_bufhdr(aiop->buf);
	if (aiop->retrycnt) {
		ullong_t block = aio_str2byteoff(aiop);
		/* Note that printing the message also bumps retrycnt. */
		pfprintf(stderr,
		    "%s Block 0t%lld (%#llx) retry %d succeeded\n",
		    aiop->fd->name, (ullong_t)block,
		    (ullong_t)block, ++aiop->retrycnt);
	}

	blocks = aio_attach(aiop);

	/* Shift "last" into "prev", then record this write as the last. */
	block = &blocks[AIO_BLOCK_INDEX(aiop)];
	block->r.w.prev_io = block->r.w.last_io;
	block->r.w.prev_iolen = block->r.w.last_iolen;
	block->u.prev_requested = block->last_requested;
	block->last_requested = aiop->tv.tv_sec;
	block->last_returned_delta = tv.tv_sec - aiop->tv.tv_sec;
	block->r.w.last_io = aiop->buf;
	block->r.w.last_iolen = aiop->iolen;
	if (hdr.start == BUF_TYPE_A)
		block->ab = 1;
	else
		block->ab = 0;

	block->read_count = 0;
	shm_ops->detach(AIO_BLOCK_HANDLE(aiop));

	clear_writemap_success(aiop);

	assert(aiop->buf != NULL);
	/* Every n-th use the write buffer is handed back. */
	if (aiop->count %
	    opts.expert_release_write_buffers_after_n_uses == 0) {
		return_write_buf(aiop->buf);
		aiop->buf = NULL;
	}
	newfd(aiop);
	do_new_write(aiop, start, 0);
	return (tv.tv_sec);
}
   3404 
   3405 void
   3406 init_all_aio(struct device *devices, struct aio_str *aio,
   3407 	int count)
   3408 {
   3409 	int i, j;
   3410 	struct device *device;
   3411 
   3412 	for (j = i = 0; i < count; i++) {
   3413 		for (device = devices; device != NULL; device = device->next) {
   3414 			if (i == 0)
   3415 				device->block = this_proc();
   3416 			aio[j].dev = device;
   3417 			aio[j].fd = NULL;
   3418 			aio[j].iolen = 0;
   3419 			add_to_aio_list(&device->stopped_ios, &aio[j]);
   3420 			while (my_gettimeofday(&device->state_ttl, NULL) == -1)
   3421 				pperror("gettimeofday");
   3422 
   3423 			j++;
   3424 		}
   3425 	}
   3426 }
   3427 void
   3428 cancel_all_io_byfd(struct fds *fd)
   3429 {
   3430 	struct aioqtop not_cancelled;
   3431 	struct aioqtop cancelled;
   3432 	struct aio_str *io;
   3433 	struct device *devp;
   3434 	int count = 0;
   3435 
   3436 	ZERO_OBJ(not_cancelled);
   3437 	ZERO_OBJ(cancelled);
   3438 
   3439 	while ((io = pop_from_aio_list(&fd->all_aios)) != NULL) {
   3440 		devp = io->dev;
   3441 		if (daio->cancel(&io->aio_res) == -1) {
   3442 			io->count = errno;
   3443 			add_to_aio_list(&not_cancelled, io);
   3444 		} else {
   3445 			count++;
   3446 			add_to_aio_list(&cancelled, io);
   3447 		}
   3448 	}
   3449 	plog(LOG_WARNING, "%d io's cancelled to path %s\n", count,
   3450 	    fd->name);
   3451 	while ((io = pop_from_aio_list(&cancelled)) != NULL) {
   3452 		DAIO_SET_ERROR(io->aio_res, ECANCELED);
   3453 		DAIO_SET_RETURN(io->aio_res, -1);
   3454 		add_to_aio_list(&devp->cancelled, io);
   3455 		cancelled_count++;
   3456 	}
   3457 	if (cancelled_count) {
   3458 		start_cancelled_io = do_start_cancelled_io;
   3459 	}
   3460 	fd->all_aios = not_cancelled;
   3461 }
/*
 * cancel_all_io:
 *	Cancel every io queued to every path of every device, reap what
 *	could not be cancelled, report any ios that are still unaccounted
 *	for and then close all the paths.
 *
 *	Does nothing if daio is NULL.
 */
void
cancel_all_io(void)
{
	int i = 0;
	int errors = 0;
	int total = 0;
	int reaped = 0;
	struct aio_str *io;
	struct aio_str *first_error_io = NULL;
	struct fds *fd;
	struct aioqtop not_cancelled;
	struct device *dev;
	/*
	 * If daio is NULL, then no paths can be open so nothing more to do.
	 */
	if (daio == NULL)
		return;

	time_now_log(LOG_NOTICE, gettext("cancelling all aios\n"));

	ZERO_OBJ(not_cancelled);

	/*
	 * Empty the list of every path.  The paths of a device form a
	 * circular list, hence the test against fdhead to stop.  An io
	 * that can not be cancelled is remembered, with the errno from
	 * the cancel saved in io->count.
	 */
	for (dev = devices; dev != NULL; dev = dev->next) {
		for (fd = dev->fdhead; ; fd = fd->next) {
			while ((io = pop_from_aio_list(
			    &fd->all_aios)) != NULL) {
				total++;
				if (daio->cancel(&io->aio_res) == -1) {
					io->count = errno;
					add_to_aio_list(&not_cancelled, io);
				} else {
					i++;
				}
			}
			if (fd->next == dev->fdhead)
				break;
		}
	}
	/*
	 * Now reap all the remaining ios, popping them off the list of
	 * ios that could not be cancelled.  The loop only ends when the
	 * wait returns -1 with errno set to EINVAL.
	 */
	while ((io = (aio_str_t *)daio->wait(NULL)) != (aio_str_t *)-1 ||
	    errno != EINVAL) {
		if (io != (aio_str_t *)-1 && io != (aio_str_t *)0) {
			reaped++;
			remove_from_aio_list(&not_cancelled, io);
		}
	}
	/*
	 * If the list contains more than one entry there was a problem,
	 * probably in the internal logic of diskomizer.
	 */
	while ((io = pop_from_aio_list(&not_cancelled)) != NULL) {
		errno = io->count;
		/*
		 * If we were interrupted the signal might have come in while
		 * we were handling an io so we could have just one io
		 * that is not in the aio system. So only report errors
		 * if there are more than one. If there are more than one
		 * report them all.
		 *
		 */
		if (errors++ > 0) {
			if (first_error_io != NULL) {
				AIOCANCEL_ERROR(first_error_io);
				first_error_io = NULL;
			}
			AIOCANCEL_ERROR(io);
		} else {
			first_error_io = io;
		}
	}
	for (dev = devices; dev != NULL; dev = dev->next) {
		close_and_free_paths(dev);
	}
	time_now_log(LOG_NOTICE,
	    "%d/%d aios cancelled successfully, %d reaped\n",
	    i, total, reaped);
}
   3542 pid_t
   3543 master_pid()
   3544 {
   3545 	return (parent_pid);
   3546 }
   3547 static int
   3548 is_master()
   3549 {
   3550 	return (parent_pid == getpid());
   3551 }
   3552 static void
   3553 register_death(pid_t pid)
   3554 {
   3555 	int i;
   3556 
   3557 	for (i = 0; i < opts.nprocs; i++) {
   3558 		if (proc_store[i].pid != pid) {
   3559 			proc_store[i].pid = 0;
   3560 			break;
   3561 		}
   3562 	}
   3563 }
   3564 static int
   3565 ischildless()
   3566 {
   3567 	int i;
   3568 
   3569 	if (proc_store != NULL) {
   3570 		for (i = 0; i < opts.nprocs; i++) {
   3571 			if (proc_store[i].pid != 0)
   3572 				return (0);
   3573 		}
   3574 	}
   3575 	return (1);
   3576 }
   3577 static int
   3578 haskids()
   3579 {
   3580 	return (!ischildless());
   3581 }
   3582 static void
   3583 mourning(pid_t pid, int stat)
   3584 {
   3585 	union {
   3586 		char dir[PATH_MAX];
   3587 		char buf[SIG2STR_MAX];
   3588 	} u;
   3589 	if (pid == -1)
   3590 		return;
   3591 
   3592 	if (!WIFEXITED(stat) && !WIFSIGNALED(stat))
   3593 		return;
   3594 
   3595 	register_death(pid);
   3596 
   3597 	if (WIFSIGNALED(stat)) {
   3598 		char *x = strsignal(WTERMSIG(stat));
   3599 		if (sig2str(WTERMSIG(stat), u.buf) == -1) {
   3600 			(void) strcpy(u.buf, "(Unknown)");
   3601 		}
   3602 		plog(LOG_ERR,
   3603 		    "Process %ld killed by signal %d %s,%s%s%s.\n",
   3604 		    (ulong_t)pid, WTERMSIG(stat), u.buf,
   3605 		    x == NULL ? "" : " ", x == NULL ? "" : x,
   3606 		    WCOREDUMP(stat) ? " core dumped" : "");
   3607 	}
   3608 	if (pid) {
   3609 		(void) snprintf(u.dir, sizeof (u.dir), "%s/%ld",
   3610 		    opts.workingdir, (ulong_t)pid);
   3611 		plog(LOG_DEBUG, "removing %s\n", u.dir);
   3612 		if (rmdir(u.dir) == -1)
   3613 			pperror("rmdir(%s)", u.dir);
   3614 	}
   3615 }
   3616 
   3617 static int
   3618 all_countdowns_zero(struct device *devp)
   3619 {
   3620 	while (devp) {
   3621 		if (devp->countdown > 0) {
   3622 			return (0);
   3623 		}
   3624 		devp = devp->next;
   3625 	}
   3626 	return (1);
   3627 }
   3628 
   3629 static int
   3630 stoptime_reached(void)
   3631 {
   3632 	return (stoptime > 0 && stoptime < gethrtime());
   3633 }
   3634 
   3635 static void
   3636 report_exit_reason(void)
   3637 {
   3638 	if (opts.nloops && all_countdowns_zero(devices)) {
   3639 		time_now_log(LOG_NOTICE,
   3640 		    "All devices have completed %ld loops; exiting",
   3641 		    opts.nloops);
   3642 } else if (stoptime_reached()) {
   3643 		time_now_log(LOG_NOTICE, "stop time reached; exiting");
   3644 	}
   3645 }
   3646 
   3647 void
   3648 cleanup(void)
   3649 {
   3650 	(void) sigignore(SIGTERM);
   3651 	(void) sigignore(SIGINT);
   3652 	new_log_transaction(stderr);
   3653 	if (is_master()) {
   3654 		int stat;
   3655 		pid_t pid;
   3656 		struct device *dev;
   3657 
   3658 		infantacide();
   3659 
   3660 		while (haskids() &&
   3661 		    (pid = waitpid((pid_t)-1, &stat, WNOHANG)) != -1 &&
   3662 		    errno != ECHILD) {
   3663 			if (pid == 0) {
   3664 				sleep(1);
   3665 				infantacide();
   3666 			} else {
   3667 				mourning(pid, stat);
   3668 			}
   3669 		}
   3670 		save_data_bufs();
   3671 
   3672 		for (dev = devices; dev != NULL; dev = dev->next) {
   3673 			close_and_free_paths(dev);
   3674 		}
   3675 		if (rmdir(opts.workingdir) == -1) {
   3676 			pperror("rmdir(%s)", opts.workingdir);
   3677 		}
   3678 		shm_ops->fini();
   3679 	} else {
   3680 		report_exit_reason();
   3681 		cancel_all_io();
   3682 	}
   3683 
   3684 	time_now_log(LOG_NOTICE, "exiting");
   3685 }
   3686 /*
   3687  * change_dir change into our own directory.
   3688  */
   3689 
   3690 void
   3691 change_dir()
   3692 {
   3693 	char dir[PATH_MAX];
   3694 
   3695 	(void) snprintf(dir, sizeof (dir),
   3696 	    "%s/%ld", opts.workingdir, (ulong_t)getpid());
   3697 
   3698 	if (mkdir(opts.workingdir, 0755) == -1 && errno != EEXIST)
   3699 		pperror("mkdir(%s, 0755)", opts.workingdir);
   3700 	if (mkdir(dir, 0755) == -1)
   3701 		pperror("mkdir(%s, 0755)", dir);
   3702 
   3703 	if (chdir(dir) == -1) {
   3704 		pperror("chdir(%s)", dir);
   3705 		(void) snprintf(dir, sizeof (dir),
   3706 		    "%s.%ld", diffs_str, (ulong_t)getpid());
   3707 		diffs = strdup(dir);
   3708 		if (diffs == NULL)
   3709 			diffs = diffs_str;
   3710 	} else {
   3711 		diffs = diffs_str;
   3712 	}
   3713 }
   3714 
   3715 /*
   3716  * aios_outstanding_or_on_hold:
   3717  *	return 1 if there are aios outstanding.
   3718  * 	return 0 if there are none.
   3719  */
   3720 int
   3721 aios_queued_to_fd(struct device *dev)
   3722 {
   3723 	struct fds *fd;
   3724 	for (fd = dev->fdhead; ; fd = fd->next) {
   3725 		if (is_aio_on_list(&fd->all_aios))
   3726 			return (1);
   3727 		if (fd->next == dev->fdhead)
   3728 			break;
   3729 	}
   3730 	return (0);
   3731 }
/*
 * first_to_restart:
 *	Pick the device whose stopped io should be restarted first.
 *
 *	The first device on the list that has stopped ios and is not
 *	DEV_NOT_READY is the initial choice.  It is replaced by any later
 *	device that is ready and has a state_ttl that is set (tv_sec is
 *	not -1) and is earlier than the choice's, or is set when the
 *	choice's is not.
 *
 *	While the chosen device has no state_ttl set the whole search is
 *	repeated after a one second sleep.
 *
 *	Returns the chosen device, or NULL if no device qualifies.
 */
struct device *
first_to_restart(struct device *devices)
{
	struct device *dev;
	struct device *first_to_start;
	struct timeval now_tv;

	do {
		while (my_gettimeofday(&now_tv, NULL) == -1)
			pperror("gettimeofday");

		/*
		 * Skip over devices that have nothing stopped or that are
		 * not ready; the loop body is deliberately empty.
		 */
		for (first_to_start = devices;
		    first_to_start != NULL && (
		    is_aio_on_list(&first_to_start->stopped_ios) == 0 ||
		    get_dev_state(first_to_start, &now_tv) ==
		    DEV_NOT_READY);
		    first_to_start = first_to_start->next) {
			/*LINTED*/
		}
		if (first_to_start == NULL)
			break;

		/*
		 * NOTE(review): the list tested below is that of the
		 * current choice, which is already known to have stopped
		 * ios when first picked, and not that of dev.  Confirm
		 * that dev->stopped_ios was not intended.
		 */
		for (dev = first_to_start->next; dev != NULL;
		    dev = dev->next) {
			if (get_dev_state(dev, &now_tv) == DEV_NOT_READY) {
				continue;
			}
			if (is_aio_on_list(&first_to_start->stopped_ios) !=
			    0 && dev->state_ttl.tv_sec != -1 &&
			    (first_to_start->state_ttl.tv_sec == -1 ||
			    timeval_lt(dev->state_ttl,
			    first_to_start->state_ttl))) {
				first_to_start = dev;
			}
		}
		/*
		 * The sleep(1) is only called when the choice has no ttl;
		 * the loop would also end if it returned 2.
		 */
	} while (first_to_start != NULL &&
	    first_to_start->state_ttl.tv_sec == -1 && sleep(1) != 2);

	return (first_to_start);
}
   3772 
   3773 struct aio_str *
   3774 wait_to_restart(struct device *devices)
   3775 {
   3776 	struct device *first_to_start;
   3777 	struct aio_str *aiop;
   3778 	struct timeval tv;
   3779 
   3780 	first_to_start = first_to_restart(devices);
   3781 
   3782 	if (first_to_start == NULL)
   3783 		return (NULL);
   3784 
   3785 	while (my_gettimeofday(&tv, NULL) == -1)
   3786 		pperror("gettimeofday");
   3787 
   3788 	tv = timeval_timeval_sub(first_to_start->state_ttl, tv);
   3789 
   3790 	if (tv.tv_sec) {
   3791 		char buf[128];
   3792 		(void) strftime(buf, 128, TIME_FORMAT,
   3793 		    localtime(&first_to_start->state_ttl.tv_sec));
   3794 
   3795 		if (tv.tv_sec > secs_till_exit()) {
   3796 			pprintf("All IO on hold until after our exit time.\n");
   3797 			exit(0);
   3798 		}
   3799 		pprintf("Sleeping for %ld seconds, until %s\n", tv.tv_sec, buf);
   3800 		(void) sleep(tv.tv_sec);
   3801 		check_exit_flag();
   3802 	}
   3803 	if (tv.tv_usec) {
   3804 		(void) usleep(tv.tv_usec);
   3805 		check_exit_flag();
   3806 	}
   3807 
   3808 	aiop = pop_from_aio_list(&first_to_start->stopped_ios);
   3809 	if (aiop != NULL && aiop->fd != NULL)
   3810 		add_to_aio_list(&aiop->fd->all_aios, aiop);
   3811 	return (aiop);
   3812 }
   3813 int
   3814 aios_outstanding(struct device *devices)
   3815 {
   3816 	struct device *dev;
   3817 
   3818 	for (dev = devices; dev != NULL; dev = dev->next) {
   3819 		if (aios_queued_to_fd(dev)) {
   3820 			return (1);
   3821 		}
   3822 	}
   3823 	return (0);
   3824 }
   3825 #ifdef NOT_USED
   3826 int
   3827 aios_on_hold(struct device *devices)
   3828 {
   3829 	struct device *dev;
   3830 
   3831 	for (dev = devices; dev != NULL; dev = dev->next) {
   3832 		if (is_aio_on_list(&dev->stopped_ios)) {
   3833 			return (1);
   3834 		}
   3835 	}
   3836 	return (0);
   3837 }
   3838 #endif
   3839 int
   3840 aios_outstanding_or_on_hold(struct device *devices)
   3841 {
   3842 	struct device *dev;
   3843 
   3844 	for (dev = devices; dev != NULL; dev = dev->next) {
   3845 		if (is_aio_on_list(&dev->cancelled))
   3846 			return (1);
   3847 		if (is_aio_on_list(&dev->stopped_ios))
   3848 			return (1);
   3849 		if (aios_queued_to_fd(dev)) {
   3850 			return (1);
   3851 		}
   3852 	}
   3853 	return (0);
   3854 }
   3855 /*
   3856  * Return true if diskomizer would exit before the time supplied is reached.
   3857  */
   3858 int
   3859 would_stop_before(time_t secs)
   3860 {
   3861 	time_t xit;
   3862 	if (secs == -1 || (xit = secs_till_exit()) < 0) {
   3863 		return (0);
   3864 	} else {
   3865 		struct timeval tv;
   3866 
   3867 		while (my_gettimeofday(&tv, NULL) == -1)
   3868 			pperror("gettimeofday");
   3869 
   3870 		return (secs >= tv.tv_sec + xit);
   3871 	}
   3872 }
   3873 /*ARGSUSED*/
   3874 static int
   3875 has_cancelled(struct device *devices)
   3876 {
   3877 	return (cancelled_count == 0 ? 0 : 1);
   3878 #ifdef SLOW_BUT_SURE
   3879 	while (device != NULL) {
   3880 		if (is_aio_on_list(&device->cancelled)) {
   3881 			return (1);
   3882 		}
   3883 		device = device->next;
   3884 	}
   3885 	return (0);
   3886 #endif
   3887 }
   3888 struct timeval
   3889 get_timeout(struct device *devices, int report_time)
   3890 {
   3891 	struct device *first_to_start;
   3892 	struct timeval tv;
   3893 	time_t secs_til_xit;
   3894 
   3895 	if (has_cancelled(devices)) {
   3896 		tv.tv_sec = tv.tv_usec = 0;
   3897 		return (tv);
   3898 	}
   3899 	secs_til_xit = secs_till_exit();
   3900 	if (secs_til_xit < 0) {
   3901 		secs_til_xit = report_time;
   3902 	}
   3903 
   3904 	if (!all_running()) {
   3905 		first_to_start = first_to_restart(devices);
   3906 	} else {
   3907 		first_to_start = NULL;
   3908 	}
   3909 
   3910 	while (my_gettimeofday(&tv, NULL) == -1)
   3911 		pperror("gettimeofday");
   3912 
   3913 	if (first_to_start == NULL) {
   3914 		tv.tv_sec = MIN(secs_til_xit, report_time);
   3915 		tv.tv_usec = 0;
   3916 		return (tv);
   3917 	}
   3918 
   3919 	tv = timeval_timeval_sub(first_to_start->state_ttl, tv);
   3920 
   3921 	if (tv.tv_sec > report_time || tv.tv_sec > secs_til_xit) {
   3922 		tv.tv_sec = MIN(secs_til_xit, report_time);
   3923 		tv.tv_usec = 0;
   3924 	} else if (tv.tv_sec < 0) {
   3925 		tv.tv_sec = tv.tv_usec = 0;
   3926 	}
   3927 	return (tv);
   3928 }
   3929 void
   3930 init_all_blk_str(struct device *dev, int proc_no)
   3931 {
   3932 	int i;
   3933 	struct timeval now_tv;
   3934 
   3935 	for (i = 0; dev != NULL; dev = dev->next, i++) {
   3936 		if ((i % opts.nprocs) == proc_no) {
   3937 			/* init_block_str(dev); */
   3938 			while (my_gettimeofday(&now_tv, NULL) == -1)
   3939 				pperror("gettimeofday");
   3940 			(void) set_dev_state(dev, DEV_NOT_READY, DEV_STOPPED,
   3941 			    &now_tv);
   3942 		}
   3943 	}
   3944 }
   3945 /*
   3946  * Start the first cancelled io for each device.
   3947  */
   3948 void
   3949 do_start_cancelled_io(struct device *devices, ullong_t start)
   3950 {
   3951 	struct device *devp;
   3952 
   3953 	for (devp = devices; devp != NULL; devp = devp->next) {
   3954 		struct aio_str *aiop;
   3955 
   3956 		check_exit_flag();
   3957 
   3958 		aiop = pop_from_aio_list(&devp->cancelled);
   3959 		if (aiop != NULL) {
   3960 			cancelled_count--;
   3961 			aiop->handler(aiop, start);
   3962 		}
   3963 	}
   3964 	if (!has_cancelled(devices)) {
   3965 		start_cancelled_io =
   3966 		    (void (*)(struct device *devices, ullong_t start))nop;
   3967 	}
   3968 }
   3969 
   3970 static void
   3971 report_times(void)
   3972 {
   3973 	struct tms tms;
   3974 
   3975 	if (times(&tms) != (clock_t)-1) {
   3976 		plog(LOG_NOTICE, "User   time %d seconds\n",
   3977 		    tms.tms_cutime/CLK_TCK);
   3978 		plog(LOG_NOTICE, "System time %d seconds\n",
   3979 		    tms.tms_cstime/CLK_TCK);
   3980 	}
   3981 }
   3982 
   3983 void
   3984 do_aio(struct device *devices, ullong_t start, int report_time)
   3985 {
   3986 	struct aio_str *aio_writes;
   3987 	int i;
   3988 	int rdflag = 0;
   3989 	pid_t pid;
   3990 	int ndevices = how_many_devices(devices);
   3991 	dev_state dev_state;
   3992 
   3993 	if (opts.wthreads + opts.wrthreads == 0) {
   3994 		plog(LOG_ERR, "WTHREADS and WRTHREADS can not both be zero\n");
   3995 		exit(1);
   3996 	}
   3997 
   3998 	aio_writes = my_calloc((opts.wthreads + opts.wrthreads) * ndevices,
   3999 	    sizeof (struct aio_str));
   4000 	if (aio_writes == NULL) {
   4001 		pfprintf(stderr, "Can't allocate write structures\n");
   4002 		exit(1);
   4003 	}
   4004 	proc_store  = my_calloc(opts.nprocs, sizeof (struct proc_store));
   4005 	if (proc_store == NULL) {
   4006 		pfprintf(stderr, "Can't allocate process store\n");
   4007 		exit(1);
   4008 	}
   4009 	if (opts.seconds_to_run > 0) {
   4010 		secs_till_exit = do_secs_till_exit;
   4011 		stoptime = gethrtime() + (opts.seconds_to_run * 1000 * MILLION);
   4012 	} else {
   4013 		secs_till_exit = inf_secs_till_exit;
   4014 		stoptime = -1;
   4015 	}
   4016 
   4017 	(void) printf("\tPID = %ld\n", (ulong_t)getpid());
   4018 	(void) printf("\t%s\n", gettext(checker_string(daio->what_checker())));
   4019 
   4020 	if (opts_fini() != 0) {
   4021 		exit(EXIT_FAILURE);
   4022 	}
   4023 
   4024 	save_usage_tracking(usage_tracking_handle, opts.obscure_usage_file);
   4025 	send_usage_tracking(usage_tracking_handle);
   4026 	close_usage_tracking(usage_tracking_handle);
   4027 
   4028 	init_read_bufs(devices);
   4029 	init_all_write_bufs(aio_writes, devices);
   4030 
   4031 	shm_ops->complete(NULL);
   4032 
   4033 	report_uadmin();
   4034 
   4035 	if (opts.debug_no_action) {
   4036 		exit(EXIT_SUCCESS);
   4037 	}
   4038 
   4039 	if (!is_readonly() && opts.expert_do_path_check &&
   4040 	    check_for_duplicate_paths(devices) == 0) {
   4041 		exit(EXIT_FAILURE);
   4042 	}
   4043 
   4044 	(void) sighold(SIGTERM);
   4045 	(void) sighold(SIGINT);
   4046 	proc_no = 0;
   4047 	NOTE(COMPETING_THREADS_NOW)
   4048 	for (i = 0; i < opts.nprocs; i++) {
   4049 		int forkcount = 0;
   4050 		(void) fflush(stdout);
   4051 		(void) fflush(stderr);
   4052 		do {
   4053 			pid = opts.use_fork1 == 0 ? fork() : fork1();
   4054 
   4055 			if (pid == -1) {
   4056 				FORK_ERROR(opts.use_fork1 == 0 ?
   4057 				    "" : "1");
   4058 				if (forkcount >= opts.max_fork_failure)
   4059 					break;
   4060 				forkcount++;
   4061 				(void) sleep(opts.fork_failure_wait_time);
   4062 			}
   4063 		} while (pid == -1);
   4064 		if (pid == -1)
   4065 			FORK_ERROR(opts.use_fork1 == 0 ? "" : "1");
   4066 		else if (pid == 0) {
   4067 			proc_no = i;
   4068 			break;
   4069 		}
   4070 		proc_store[i].pid = pid;
   4071 		plog(LOG_DEBUG, "fork%s %ld\n", opts.use_fork1 == 0 ? "" : "1",
   4072 		    (ulong_t)pid);
   4073 	}
   4074 	if (pid != 0) { /* We are the parent */
   4075 		int status;
   4076 
   4077 		(void) sigrelse(SIGINT);
   4078 		(void) sigrelse(SIGTERM);
   4079 		while ((pid = waitpid((pid_t)-1, &status, 0)) != -1 &&
   4080 		    errno != ECHILD) {
   4081 			if (WIFEXITED(status) || WIFSIGNALED(status)) {
   4082 				mourning(pid, status);
   4083 				if (WEXITSTATUS(status) != 0 ||
   4084 				    WIFSIGNALED(status)) {
   4085 					exit_status = EXIT_FAILURE;
   4086 				}
   4087 			}
   4088 		}
   4089 		report_times();
   4090 		exit(exit_status);
   4091 	}
   4092 	free(proc_store);
   4093 
   4094 	daio->init((opts.wthreads + opts.rthreads + opts.wrthreads) * ndevices);
   4095 	(void) sigrelse(SIGINT);
   4096 	(void) sigrelse(SIGTERM);
   4097 	change_dir();
   4098 	/* init_all_blk_str(devices, proc_no); */
   4099 	if (usr1_exit)
   4100 		exit(0);
   4101 	init_all_aio(devices, aio_writes, opts.wthreads + opts.wrthreads);
   4102 
   4103 	assert(devices->block == this_proc());
   4104 	new_log_transaction(stderr);
   4105 
   4106 	/* This is the main loop for the diskomizer. */
   4107 	for (i = 0; aios_outstanding_or_on_hold(devices) != 0; i++) {
   4108 		struct aio_str *aiop;
   4109 		struct timeval timeout, now_tv;
   4110 		time_t tyme;
   4111 		int x;
   4112 
   4113 		check_exit_flag();
   4114 		if (aios_outstanding(devices) != 0) {
   4115 			timeout = get_timeout(devices, report_time);
   4116 
   4117 			new_log_transaction(stderr);
   4118 			aiop = (struct aio_str *)daio->wait(&timeout);
   4119 
   4120 			x = errno;
   4121 			check_exit_flag();
   4122 			while (my_gettimeofday(&now_tv, NULL) == -1)
   4123 				pperror("gettimeofday");
   4124 
   4125 			if ((long)aiop == -1) {
   4126 				errno = x;
   4127 				if (errno == EINVAL &&
   4128 				    aios_outstanding_or_on_hold(
   4129 				    devices) == 0) {
   4130 					AIOWAIT_ERROR(timeout);
   4131 					exit(1);
   4132 				}
   4133 				AIOWAIT_ERROR(timeout);
   4134 				continue;
   4135 			} else if ((long)aiop == 0) {
   4136 				/* the aiowait timed out */
   4137 				report_all_hangers(devices, report_time);
   4138 				restart_stopped_devices(start, devices,
   4139 				    &now_tv);
   4140 				continue;
   4141 			}
   4142 		} else {
   4143 		/*
   4144 		 * If all the io requests have been stopped then we call
   4145 		 * wait_to_restart which will return the first io to
   4146 		 * restart from all the devices and it will sleep until
   4147 		 * that io is due to be queued. It will return NULL if the
   4148 		 * next io to start would be started after the process should
   4149 		 * have exited or if there are no stopped devices or if all
   4150 		 * the ios have been deferred.
   4151 		 */
   4152 			if ((aiop = wait_to_restart(devices)) != NULL) {
   4153 				while (my_gettimeofday(&now_tv, NULL) == -1)
   4154 					pperror("gettimeofday");
   4155 				dev_state = set_dev_state(aiop->dev,
   4156 				    DEV_STOPPED, DEV_STARTING, &now_tv);
   4157 				assert(dev_state == DEV_STOPPED ||
   4158 				    dev_state == DEV_STARTING);
   4159 			} else if ((aiop = get_deferred_io(devices)) == NULL) {
   4160 				exit(exit_status);
   4161 			}
   4162 		}
   4163 		if (aiop == NULL ||
   4164 		    (opts.nloops != 0 && aiop->dev->countdown == 0) ||
   4165 		    stoptime_reached()) {
   4166 			/* this stops the other processes being killed */
   4167 			usr1_exit++;
   4168 			if (aiop != NULL) {
   4169 				if (aiop->fd != NULL)
   4170 					remove_from_aio_list(
   4171 					    &aiop->fd->all_aios, aiop);
   4172 				if (aiop->next != NULL)
   4173 					aiop->next->prev = aiop->prev;
   4174 			}
   4175 			continue;
   4176 		}
   4177 		/*
   4178 		 * Controls.
   4179 		 */
   4180 		dev_state = get_dev_state(aiop->dev, &now_tv);
   4181 		if (dev_state == DEV_RUNNING || dev_state == DEV_STARTING) {
   4182 			tyme = aiop->handler(aiop, start);
   4183 		} else {
   4184 			if (aiop->fd != NULL) {
   4185 				remove_from_aio_list(&aiop->fd->all_aios,
   4186 				    aiop);
   4187 			}
   4188 			add_to_aio_list(&aiop->dev->stopped_ios, aiop);
   4189 			if (aios_queued_to_fd(aiop->dev) == 0) {
   4190 				(void) set_dev_state(aiop->dev,
   4191 				    DEV_STOPPING, DEV_STOPPED, &now_tv);
   4192 			}
   4193 		}
   4194 		start_cancelled_io(devices, start);
   4195 		start_deferred(devices, start);
   4196 		report_hangers(aiop->dev, tyme, report_time);
   4197 		restart_stopped_devices(start, devices, &now_tv);
   4198 		if (aiop->off >= aiop->dev->read_start_block &&
   4199 		    aiop->handler != handle_read &&
   4200 		    i > (2*(opts.rthreads + opts.wthreads))) {
   4201 			if (opts.rthreads > aiop->dev->running_rthreads) {
   4202 				if (aiop->dev->running_rthreads == 0 &&
   4203 				    rdflag == 0) {
   4204 
   4205 					rdflag = 1;
   4206 					time_now_log(LOG_NOTICE,
   4207 					    "Starting first %s reader %d",
   4208 					    random_str, i);
   4209 				}
   4210 				if (init_read(aiop, start))
   4211 					aiop->dev->running_rthreads++;
   4212 			} else if (rdflag == 1) {
   4213 				time_now_log(LOG_NOTICE,
   4214 				    "All %sreaders started %d",
   4215 				    random_str, i);
   4216 				rdflag = 2;
   4217 			}
   4218 		}
   4219 	}
   4220 
   4221 	exit(exit_status);
   4222 }
   4223 
   4224 
   4225 /*
   4226  * select_error_func:
   4227  * 	search the handlers array for an entry whose name matches the name
   4228  *	passed in. If the name passed in is NULL then default to using the
   4229  *	first handler in the list.
   4230  */
   4231 static int
   4232 select_error_func(const char *name,
   4233 	struct error_handlers *handlers,
   4234 	int nhandlers,
   4235 	on_error_t *oef,
   4236 	int rw)
   4237 {
   4238 	int i;
   4239 	struct error_handlers *h;
   4240 
   4241 	if (name == NULL) {
   4242 		h = &handlers[0];
   4243 	} else for (i = 0; i < nhandlers; i++) {
   4244 		h = &handlers[i];
   4245 		if (((h->rw & rw) != 0) && strcasecmp(h->name, name) == 0) {
   4246 			break;
   4247 		} else {
   4248 			h = NULL;
   4249 		}
   4250 	}
   4251 	if (h != NULL) {
   4252 		*oef = h->func;
   4253 		if (h->setup() != 1) {
   4254 			fprintf(stderr, "Unable to init %s\n", h->name);
   4255 		}
   4256 		return (h->breaker == 0 ? 0 : 1);
   4257 	}
   4258 	return (-1);
   4259 }
   4260 
   4261 on_error_t *
   4262 setup_onerror(char *prog, const char *str, int rw)
   4263 {
   4264 	char *tmp;
   4265 	char *opaque;
   4266 	char *toogo;
   4267 	char i;
   4268 	on_error_t *oef = NULL;
   4269 
   4270 	if ((toogo = strdup(str)) == NULL) {
   4271 		(void) fprintf(stderr, "strdup(%s) failed: %s\n",
   4272 		    str, strerror(errno));
   4273 		return (NULL);
   4274 	}
   4275 
   4276 	for (i = 0, tmp = toogo; ; i++) {
   4277 		on_error_t *noef;
   4278 		int n;
   4279 
   4280 		if ((tmp = strtok_r(tmp, ",", &opaque)) == NULL) {
   4281 			break;
   4282 		}
   4283 		noef = realloc(oef, (i+2) * sizeof (on_error_t));
   4284 		if (noef == NULL) {
   4285 			free(toogo);
   4286 			free(oef);
   4287 			return (NULL);
   4288 		}
   4289 		oef = noef;
   4290 		oef[i+1] = NULL;
   4291 		if ((n = select_error_func(tmp, on_error_table,
   4292 		    ARRAY_LEN(on_error_table), &oef[i], rw)) != 0) {
   4293 			if (n == -1) {
   4294 				(void) fprintf(stderr,
   4295 				    "bad on error option %s in %s\n",
   4296 				    NIL(tmp), str);
   4297 			}
   4298 			break;
   4299 		}
   4300 		tmp = NULL;
   4301 	}
   4302 	free(toogo);
   4303 	if (oef == NULL) {
   4304 		if ((oef = malloc(sizeof (on_error_t))) != NULL) {
   4305 			oef[0] = NULL;
   4306 		}
   4307 	}
   4308 	return (oef);
   4309 }
   4310 
   4311 int
   4312 how_many_devices(struct device *devices)
   4313 {
   4314 	int i = 0;
   4315 
   4316 	while (devices != NULL) {
   4317 		i++;
   4318 		devices = devices->next;
   4319 	}
   4320 	return (i);
   4321 }
   4322 
   4323 ullong_t
   4324 set_file_size(const char *dir)
   4325 {
   4326 	struct statvfs buf;
   4327 	if (opts.expert_amount_to_leave_unused && opts.number_of_files &&
   4328 	    statvfs(dir, &buf) != -1)  {
   4329 		ullong_t count;
   4330 		count = (buf.f_bavail * buf.f_frsize) -
   4331 		    opts.expert_amount_to_leave_unused;
   4332 		count = (count/opts.number_of_files);
   4333 
   4334 		return (count);
   4335 	}
   4336 	return (opts.file_size);
   4337 }
   4338 
   4339 int
   4340 set_number_of_files(const char *dir)
   4341 {
   4342 	struct statvfs buf;
   4343 	if (opts.expert_amount_to_leave_unused && opts.file_size &&
   4344 	    statvfs(dir, &buf) != -1) {
   4345 		ullong_t count;
   4346 		longlong_t n;
   4347 		int i;
   4348 		count = (ullong_t)buf.f_bavail * (ullong_t)buf.f_frsize;
   4349 		count -= opts.expert_amount_to_leave_unused;
   4350 		n = count / opts.file_size;
   4351 		i = (int)(MIN(n, INT_MAX));
   4352 		if (opts.number_of_files)
   4353 			return (MIN(i, opts.number_of_files));
   4354 		else
   4355 			return (i);
   4356 	}
   4357 	return (opts.number_of_files);
   4358 }
   4359 /*
   4360  * read the path as if it is a symbolic link and process that.
   4361  */
   4362 static char *
   4363 do_link(char *path)
   4364 {
   4365 	char buf[PATH_MAX+1];
   4366 	char *res;
   4367 	int x;
   4368 
   4369 	if ((x = readlink(path, &buf[0], sizeof (buf))) > 0) {
   4370 		buf[x] = NULL;
   4371 		res = full_path(path, &buf[0]);
   4372 	} else {
   4373 		res = my_strdup(path);
   4374 	}
   4375 
   4376 	if (res == NULL) {
   4377 		exit(EXIT_FAILURE);
   4378 	}
   4379 	return (res);
   4380 }
   4381 
   4382 static struct fds *
   4383 open_path_count(struct device *devp, char *name, ullong_t size)
   4384 {
   4385 	struct fds *fd;
   4386 	int i = 0;
   4387 
   4388 	do {
   4389 		if ((fd = open_path(devp, name, size)) != NULL)
   4390 			break;
   4391 	} while (i++ < opts.open_retries);
   4392 
   4393 	return (fd);
   4394 }
   4395 struct device *
   4396 open_path_group(struct paths *paths, int paths_to_use, int error_paths)
   4397 {
   4398 	int count;
   4399 	struct other_paths *opath;
   4400 	struct device *devp;
   4401 	struct fds *fd;
   4402 	int total_paths = paths_to_use + error_paths;
   4403 
   4404 	if ((devp = (struct device *)my_calloc(1,
   4405 	    sizeof (struct device))) == NULL) {
   4406 		return (NULL);
   4407 	}
   4408 
   4409 	for (count = 0, opath = paths->op;
   4410 	    count < total_paths && opath != NULL; /* */) {
   4411 		if ((fd = open_path_count(devp,
   4412 		    opath->path, 0)) != NULL) {
   4413 			fd->error_path =
   4414 			    count >= paths_to_use ? 1 : 0;
   4415 			fd->path_id = count++;
   4416 		}
   4417 		opath = opath->next;
   4418 	}
   4419 	return (devp);
   4420 }
   4421 /*
   4422  * Open_devices
   4423  *
   4424  * ARGUMENT: char *name
   4425  *	A space seperated list of devices.  Devices may be grouped by
   4426  *	putting curly brackets around them to sepficy multiple paths to
   4427  *	the same device.
   4428  */
   4429 struct device *
   4430 open_devices(char *name)
   4431 {
   4432 	struct device *devp;
   4433 	struct device *newone;
   4434 	int brace_count = 0;
   4435 	int error_paths = opts.error_paths;
   4436 	int paths_to_use = opts.paths_to_use;
   4437 	struct paths *path_group = NULL;
   4438 	struct other_paths *op;
   4439 	char *tmp;
   4440 	char *toogo;
   4441 	char *opaque;
   4442 
   4443 	if ((toogo = strdup(name)) == NULL) {
   4444 		(void) fprintf(stderr, "strdup(%s) failed: %s\n", name,
   4445 		    strerror(errno));
   4446 		exit(1);
   4447 	}
   4448 	tmp = toogo;
   4449 	devp = NULL;
   4450 
   4451 	while ((tmp = strtok_r(tmp, "\t ", &opaque)) != NULL) {
   4452 		struct stat64 sbuf;
   4453 		if (usr1_exit)
   4454 			exit(0);
   4455 
   4456 		if (*tmp == OPEN_BRACE) {
   4457 			if (brace_count++ == 0) {
   4458 				error_paths = paths_to_use = 0;
   4459 			}
   4460 			if (path_group == NULL) {
   4461 				path_group = my_calloc(
   4462 				    sizeof (struct paths), 1);
   4463 				if (path_group == NULL) {
   4464 					exit(EXIT_FAILURE);
   4465 				}
   4466 			}
   4467 		} else if (*tmp == CLOSE_BRACE) {
   4468 			if (--brace_count == 0) {
   4469 				if (path_group->logicalpath == NULL) {
   4470 					plog(LOG_WARNING, gettext(
   4471 					    "Empty path device list "
   4472 					    "found"));
   4473 					free(path_group);
   4474 					path_group = NULL;
   4475 					continue;
   4476 				}
   4477 				newone = open_device(NULL,
   4478 				    path_group, opts.file_size,
   4479 				    paths_to_use, error_paths);
   4480 				if (newone != NULL) {
   4481 					newone->next = devp;
   4482 					devp = newone;
   4483 				}
   4484 				free_paths(path_group);
   4485 				path_group = NULL;
   4486 				error_paths = opts.error_paths;
   4487 				paths_to_use = opts.paths_to_use;
   4488 			}
   4489 			if (brace_count < 0)
   4490 				plog(LOG_WARNING,
   4491 				    "Unbalanced braces in device list\n");
   4492 		} else if (*tmp == '-') {
   4493 		/* PATH options */
   4494 			plog(LOG_WARNING, "Path options are not currently "
   4495 			    "supported: \"%s\" ignored\n", tmp);
   4496 		} else if (path_group != NULL) {
   4497 			if (path_group->op == NULL) {
   4498 				op = my_calloc(
   4499 				    sizeof (struct other_paths), 1);
   4500 				if (op == NULL) {
   4501 					exit(EXIT_FAILURE);
   4502 				}
   4503 				path_group->op = op;
   4504 				path_group->logicalpath = my_strdup(tmp);
   4505 			} else {
   4506 				/* lint does not like empty loops */
   4507 				for (op = path_group->op; op->next != NULL; ) {
   4508 					op = op->next;
   4509 				}
   4510 				op->next = my_calloc(
   4511 				    sizeof (struct other_paths), 1);
   4512 				if (op->next == NULL) {
   4513 					exit(EXIT_FAILURE);
   4514 				}
   4515 				op = op->next;
   4516 			}
   4517 			if (brace_count > 1) {
   4518 				if (path_stop_check == do_path_stop_check) {
   4519 					error_paths++;
   4520 				}
   4521 			} else {
   4522 				paths_to_use++;
   4523 			}
   4524 			op->path = do_link(tmp);
   4525 		} else if ((opts.number_of_files ||
   4526 		    opts.expert_amount_to_leave_unused) &&
   4527 		    daio->stat(tmp, &sbuf) != -1 && S_ISDIR(sbuf.st_mode)) {
   4528 			int len = strlen(tmp) +
   4529 			    strlen(opts.obscure_data_file_basename) + 16;
   4530 			int i;
   4531 			char *x;
   4532 			int nf = set_number_of_files(tmp);
   4533 			ullong_t size = set_file_size(tmp);
   4534 			pprintf("%s %d files of %lld bytes\n", tmp, nf,
   4535 			    (ullong_t)size);
   4536 			for (i = 0; i < nf; i++) {
   4537 				/*
   4538 				 * If the open succeds then we just have to
   4539 				 * "leak" this memory here as it is in use
   4540 				 * in the device structures.
   4541 				 */
   4542 				if ((x = malloc(len)) == NULL) {
   4543 					MALLOC_ERROR(len);
   4544 					exit(1);
   4545 				}
   4546 				(void) sprintf(x, "%s/%s%d", tmp,
   4547 				    opts.obscure_data_file_basename, i);
   4548 
   4549 				newone = open_device(x, NULL, size,
   4550 				    paths_to_use, error_paths);
   4551 
   4552 				if (newone != NULL) {
   4553 					newone->next = devp;
   4554 					devp = newone;
   4555 				} else {
   4556 					free(x);
   4557 				}
   4558 			}
   4559 		} else {
   4560 			newone = open_device(tmp, NULL, opts.file_size,
   4561 			    paths_to_use, error_paths);
   4562 
   4563 			if (newone != NULL) {
   4564 				newone->next = devp;
   4565 				devp = newone;
   4566 			}
   4567 		}
   4568 		tmp = NULL;
   4569 	}
   4570 	if (brace_count != 0) {
   4571 		plog(LOG_WARNING, "Unbalanced braces in device list\n");
   4572 	}
   4573 	/* don't free toogo as it is being used in the devices structures. */
   4574 	/* free(toogo); */
   4575 	if (usr1_exit) {
   4576 		exit(0);
   4577 	}
   4578 	if (devp != NULL) {
   4579 		init_device_control(devp);
   4580 	}
   4581 	return (devp);
   4582 }
   4583 
   4584 void
   4585 print_dev(struct device *dev)
   4586 {
   4587 	struct fds *fds;
   4588 	static const char device_str[] = "device";
   4589 
   4590 	(void) printf("Logical Device: %s\n", dev->logicalname);
   4591 
   4592 	USAGE_TRACKING_OPEN_KEY(device_str, NULL, dev->logicalname);
   4593 
   4594 	fds = dev->fdhead;
   4595 	(void) printf("Physical device%s:\n", fds->next == fds ? "" : "s");
   4596 
   4597 	USAGE_TRACKING_OPEN_KEY("paths", NULL, NULL);
   4598 
   4599 	for (;;) {
   4600 		USAGE_TRACKING_STORE_KEY_VALUE("longname", fds->longname);
   4601 		USAGE_TRACKING_STORE_KEY_VALUE("created", TRUE_OR_FALSE(
   4602 		    fds->created));
   4603 		(void) printf("\t%s%s%s%s%s%s\n", fds->longname,
   4604 		    fds->error_path || fds->created ? " (" : "",
   4605 		    fds->error_path ?  "error path" : "",
   4606 		    fds->error_path && fds->created ? ", " : "",
   4607 		    fds->created ? "created" : "",
   4608 		    fds->error_path || fds->created ? ")" : "");
   4609 		if (fds->longname != fds->shortname) {
   4610 			(void) printf("\t\t(%s)\n", fds->shortname);
   4611 		}
   4612 		if (fds->next != dev->fdhead) {
   4613 			fds = fds->next;
   4614 		} else {
   4615 			break;
   4616 		}
   4617 	}
   4618 
   4619 	USAGE_TRACKING_CLOSE_KEY();
   4620 
   4621 	print_number_of_bytes(dev->device_block_size,
   4622 	    "Device block size", "Device block size");
   4623 	print_number_of_bytes(dev->length, "length", "length");
   4624 	print_number(LEN_BYTES2BLOCKS(dev), "block", "blocks");
   4625 	(void) fflush(stdout);
   4626 	if (write_loops) {
   4627 		print_number(dev->countdown, "write", "writes");
   4628 	} else if (opts.nloops) {
   4629 		print_number(dev->countdown, "read", "reads");
   4630 	}
   4631 
   4632 	USAGE_TRACKING_STORE_KEY_VALUE_INT("length", dev->length);
   4633 
   4634 	USAGE_TRACKING_STORE_KEY_VALUE_INT("blocks", LEN_BYTES2BLOCKS(dev));
   4635 
   4636 	USAGE_TRACKING_CLOSE_KEY();
   4637 }
   4638 /*
   4639  * close all the fds and free the data associated with all the  paths
   4640  * for a device.
   4641  */
   4642 void
   4643 close_and_free_paths(struct device *dev)
   4644 {
   4645 	struct fds *fd, *next;
   4646 
   4647 	for (fd = dev->fdhead, next = fd->next; ; fd = next, next = fd->next) {
   4648 		(void) daio->close(fd->fd);
   4649 		if (opts.expert_cleanup_created_files && fd->created &&
   4650 		    is_master() && exit_status == EXIT_SUCCESS &&
   4651 		    get_shared_device_error(dev->shared_data_handle) == 0) {
   4652 			pprintf(gettext("Removing %s\n"), fd->longname);
   4653 			if (daio->unlink(fd->longname) == -1) {
   4654 				pperror(gettext("unlink(%s)"), fd->longname);
   4655 			}
   4656 		}
   4657 		if (fd->shortname != fd->longname) {
   4658 			free(fd->longname);
   4659 		}
   4660 		free(fd->shortname);
   4661 		free(fd);
   4662 		if (dev->fdhead == next) {
   4663 			break;
   4664 		}
   4665 	}
   4666 	dev->fdhead = NULL;
   4667 }
   4668 struct fds *
   4669 open_path(struct device *devp, char *name, ullong_t size)
   4670 {
   4671 	struct fds *fd;
   4672 	struct dk_cinfo dk_cinfo;
   4673 	struct stat64 sbuf;
   4674 	char create;
   4675 
   4676 	check_exit_flag();
   4677 
   4678 	if (daio->stat(name, &sbuf) == -1) {
   4679 		if (size == 0) {
   4680 			pfprintf(stderr, "stat(%s) == -1 errno = %d (%s)\n",
   4681 			    name, errno, strerror(errno));
   4682 			return (NULL);
   4683 		} else {
   4684 			create = 1;
   4685 		}
   4686 	} else {
   4687 		create = 0;
   4688 	}
   4689 	if ((fd = (struct fds *)calloc(1, sizeof (struct fds))) == NULL) {
   4690 		return (NULL);
   4691 	}
   4692 
   4693 	if (opts.debug_no_action == 0) {
   4694 		if ((fd->fd = daio->open(name,
   4695 		    (is_readonly() ? O_RDONLY : O_RDWR)|
   4696 		    (opts.o_sync ? O_SYNC : 0)|
   4697 		    (opts.o_excl ? O_EXCL : 0) |
   4698 		    (opts.o_ndelay ? O_NDELAY : 0) |
   4699 		    (create ? O_CREAT : 0) |
   4700 		    (opts.o_trunc ? O_TRUNC : 0), 0600)) == -1) {
   4701 			pperror("open(%s, %s%s%s%s%s%s)",
   4702 			    name, (is_readonly() ? "O_RDONLY" : "O_RDWR"),
   4703 			    (opts.o_excl ? "|O_EXCL": ""),
   4704 			    (opts.o_sync ? "|O_SYNC": ""),
   4705 			    (opts.o_ndelay ? "|O_NDELAY": ""),
   4706 			    (opts.o_trunc ? "|O_TRUNC": ""),
   4707 			    (create ? "|O_CREAT, 0600" : ""));
   4708 			free(fd);
   4709 			return (NULL);
   4710 		}
   4711 		if (daio->directio(fd->fd, opts.directio == 1 ?
   4712 		    DIRECTIO_ON : DIRECTIO_OFF) == -1) {
   4713 			if (errno != ENOTTY || opts.directio == 1) {
   4714 				pperror("directio(\"%s\") failed", name);
   4715 			}
   4716 		}
   4717 		fd->created = create;
   4718 		if (create) {
   4719 			if (daio->fstat(fd->fd, &sbuf) == -1) {
   4720 				FSTAT_ERROR(fd->fd, name);
   4721 				(void) daio->close(fd->fd);
   4722 				free(fd);
   4723 				return (NULL);
   4724 			}
   4725 			if (S_ISREG(sbuf.st_mode) &&
   4726 			    daio->ftruncate(fd->fd, size) == -1) {
   4727 				(void) daio->close(fd->fd);
   4728 				free(fd);
   4729 				return (NULL);
   4730 			}
   4731 		}
   4732 	}
   4733 	if (sbuf.st_mode & (S_IFCHR|S_IFBLK)) {
   4734 		fd->devid.dev = sbuf.st_rdev;
   4735 	} else {
   4736 		fd->devid.dev = sbuf.st_dev;
   4737 	}
   4738 	fd->devid.ino = sbuf.st_ino;
   4739 	fd->read_times.str = read_str;
   4740 	fd->read_times.best = 0xffffffff;
   4741 	fd->write_times.str = write_str;
   4742 	fd->write_times.best = 0xffffffff;
   4743 	fd->last_read_time = fd->last_write_time = ~0;
   4744 	if ((fd->longname = strdup(name)) == NULL) {
   4745 		(void) daio->close(fd->fd);
   4746 		free(fd);
   4747 		return (NULL);
   4748 	}
   4749 	fd->stop_flag = 0;
   4750 	fd->shared_data_handle = init_shared_device_info(opts.nprocs);
   4751 	if (fd->shared_data_handle == NULL) {
   4752 		plog(LOG_ERR, gettext("Unable to allocate shared data "
   4753 		    "handle for %s\n"), name);
   4754 	}
   4755 
   4756 	if (daio->ioctl(fd->fd, DKIOCINFO, &dk_cinfo) == -1) {
   4757 		fd->shortname = fd->longname;
   4758 	} else {
   4759 		fd->shortname = calloc(1,
   4760 		    strlen(dk_cinfo.dki_dname) + (3 * 10));
   4761 		if (fd->shortname == NULL) {
   4762 			fd->shortname = fd->longname;
   4763 		} else {
   4764 			(void) sprintf(fd->shortname, "%s%d:%c",
   4765 			    dk_cinfo.dki_dname, dk_cinfo.dki_unit,
   4766 			    dk_cinfo.dki_partition + 'a');
   4767 		}
   4768 	}
   4769 	if (opts.use_long_names) {
   4770 		fd->name = fd->longname;
   4771 	} else {
   4772 		fd->name = fd->shortname;
   4773 	}
   4774 	Longest_device_name = MAX(Longest_device_name, strlen(fd->name));
   4775 	if (devp->fdhead == NULL) {
   4776 		devp->fdhead = fd;
   4777 		fd->next = fd;
   4778 	} else {
   4779 		fd->next = devp->fdhead->next;
   4780 		devp->fdhead->next = fd;
   4781 	}
   4782 	return (fd);
   4783 }
   4784 
   4785 void *
   4786 read_vtoc_all_paths(struct fds *fdhead)
   4787 {
   4788 	struct fds *fd;
   4789 	void *handle = NULL;
   4790 
   4791 	fd = fdhead;
   4792 	do {
   4793 		if ((handle = daio->read_vtoc(fd->fd)) != NULL) {
   4794 			break;
   4795 		}
   4796 		fd = fd->next;
   4797 
   4798 	} while (fd->next != fdhead); /* do loop! */
   4799 
   4800 	return (handle);
   4801 }
   4802 struct paths *
   4803 do_ap(const char *inpath)
   4804 {
   4805 	return (daio->findap(inpath, opts.dev_tree));
   4806 }
   4807 
   4808 /*
   4809  * Set the minimum possible block size that can be used for all the devices
   4810  * in this test set. Typically the block sizes seen are 512 bytes, 2048 bytes
   4811  * or 4096 bytes. It will choose the smallest common multiple of the block
   4812  * sizes available. Typically this will just be the largest block size of
   4813  * all the devices but if you had a 3K and 4K block sized device this will
   4814  * return the smallest block size possible is 12k.
   4815  */
   4816 static void
   4817 set_minimum_block_size(int block_size)
   4818 {
   4819 	if (min_block_size == 0) {
   4820 		min_block_size = block_size;
   4821 	} else {
   4822 		min_block_size = min_block_size * block_size /
   4823 		    gcd(min_block_size, block_size);
   4824 	}
   4825 }
   4826 
   4827 struct device *
   4828 open_device(char *name, struct paths *paths,  ullong_t size,
   4829 	int paths_to_use, int error_paths)
   4830 {
   4831 	struct stat64 sbuf;
   4832 	ullong_t nsize;
   4833 	void *vtoc_handle;
   4834 	struct device *devp;
   4835 	struct fds *fd;
   4836 	int total_paths = paths_to_use + error_paths;
   4837 
   4838 
   4839 	if (paths != NULL) {
   4840 		name = my_strdup(paths->logicalpath);
   4841 		if (name == NULL) {
   4842 			return (NULL);
   4843 		}
   4844 		devp = open_path_group(paths, paths_to_use, error_paths);
   4845 	} else if (total_paths > 1 &&
   4846 	    (paths = do_ap(name)) != NULL) {
   4847 		devp = open_path_group(paths, paths_to_use, error_paths);
   4848 		free_paths(paths);
   4849 	} else {
   4850 		if ((devp = (struct device *)calloc(1,
   4851 			sizeof (struct device))) == NULL) {
   4852 			return (NULL);
   4853 		}
   4854 		if ((fd = open_path_count(devp, name, size)) != NULL) {
   4855 			fd->path_id = 0;
   4856 			fd->error_path = 0;
   4857 		}
   4858 	}
   4859 	Longest_logical_name = MAX(Longest_logical_name, strlen(name));
   4860 
   4861 	if (devp->fdhead == NULL) {
   4862 		free(devp);
   4863 		return (NULL);
   4864 	} else {
   4865 		devp->logicalname = name;
   4866 	}
   4867 
   4868 	if (opts.debug_no_action) {
   4869 		return (devp);
   4870 	}
   4871 	/*
   4872 	 * this is a mess.
   4873 	 */
   4874 
   4875 	if (daio->fstat(devp->fdhead->fd, &sbuf) == -1) {
   4876 		FSTAT_ERROR(devp->fdhead->fd, devp->fdhead->name);
   4877 		close_and_free_paths(devp);
   4878 		free(devp);
   4879 		return (NULL);
   4880 	}
   4881 	devp->next = NULL;
   4882 	devp->choose_block = seq_block;
   4883 
   4884 	if (!(sbuf.st_mode & S_IFCHR)) {
   4885 		plog(LOG_DEBUG, "Not a character device\n");
   4886 		nsize = (ullong_t)sbuf.st_size;
   4887 		devp->device_block_size = SIZEOF_BUF;
   4888 	} else if ((vtoc_handle = read_vtoc_all_paths(devp->fdhead)) == NULL) {
   4889 		nsize = (ullong_t)SIZEOF_BUF*(ullong_t)sbuf.st_blocks;
   4890 		devp->device_block_size = SIZEOF_BUF;
   4891 	} else {
   4892 		const struct disko_partition *part;
   4893 
   4894 		devp->device_block_size = disko_vtoc_sectorsz(vtoc_handle);
   4895 
   4896 		if (devp->device_block_size == 0) {
   4897 			devp->device_block_size = DEFAULT_BLOCK_SIZE;
   4898 		}
   4899 
   4900 		part = disko_vtoc_this_partition(vtoc_handle);
   4901 
   4902 		devp->v_part = malloc(sizeof (struct disko_partition));
   4903 
   4904 		if (devp->v_part != NULL) {
   4905 			*devp->v_part = *part;
   4906 		}
   4907 
   4908 		nsize = (ullong_t)devp->device_block_size * part->p_size;
   4909 
   4910 		disko_vtoc_free(vtoc_handle);
   4911 	}
   4912 	set_minimum_block_size(devp->device_block_size);
   4913 
   4914 	if (size == 0 || (nsize > 0 && nsize < size)) {
   4915 		size = nsize;
   4916 	}
   4917 
   4918 	if (size == 0) {
   4919 		(void) fprintf(stderr, gettext("File size is zero on %s\n"),
   4920 		    name);
   4921 		(void) fflush(stderr);
   4922 		close_and_free_paths(devp);
   4923 		free(devp);
   4924 		return (NULL);
   4925 	}
   4926 	devp->length = size-(opts.start_offset *
   4927 	    INDEX_TO_DIOLEN(max_disk_io_len));
   4928 	devp->read_start_block = opts.expert_recent_log_size +
   4929 	    ((devp->length/INDEX_TO_DIOLEN(max_disk_io_len)) *
   4930 	    opts.start_reads_percentage)/100;
   4931 
   4932 	if (devp->read_start_block < (opts.nprocs * (opts.rthreads +
   4933 	    opts.wthreads))) {
   4934 		devp->read_start_block = (opts.nprocs * (opts.rthreads +
   4935 		    opts.wthreads));
   4936 	}
   4937 	if (devp->read_start_block > LEN_BYTES2BLOCKS(devp)) {
   4938 		devp->read_start_block = LEN_BYTES2BLOCKS(devp);
   4939 	}
   4940 
   4941 	if (opts.nloops) {
   4942 		devp->countdown = (opts.nloops * devp->length) /
   4943 		    (opts.nprocs * INDEX_TO_DIOLEN(max_disk_io_len));
   4944 	} else {
   4945 		devp->countdown = ~(uint64_t)0;
   4946 	}
   4947 
   4948 	print_dev(devp);
   4949 	print_number_of_bytes(size, "size", "size");
   4950 	if (opts.start_offset * INDEX_TO_DIOLEN(max_disk_io_len) > size) {
   4951 		(void) printf("starting offset is greater than disk size! ");
   4952 		(void) printf("%llx > %llx\n", (ullong_t)(opts.start_offset *
   4953 		    INDEX_TO_DIOLEN(max_disk_io_len)), (ullong_t)size);
   4954 		close_and_free_paths(devp);
   4955 		free(devp);
   4956 		return (NULL);
   4957 	}
   4958 	if (LEN_BYTES2BLOCKS(devp) < (opts.nprocs *
   4959 	    (opts.wthreads + opts.rthreads + opts.wrthreads))) {
   4960 
   4961 		(void) printf("There are not enough blocks (%#llx) to support "
   4962 		    "this many I/O's (%#lx) on device %s, device closed\n",
   4963 		    (ullong_t)LEN_BYTES2BLOCKS(devp),
   4964 		    (ulong_t)(opts.nprocs * (opts.wthreads +
   4965 		    opts.rthreads + opts.wrthreads)), name);
   4966 		close_and_free_paths(devp);
   4967 		free(devp);
   4968 		return (NULL);
   4969 	}
   4970 	if (opts.expert_max_active_time == 0) {
   4971 		devp->state_ttl.tv_sec = 0;
   4972 		devp->state_ttl.tv_usec = 0;
   4973 	} else {
   4974 		while (my_gettimeofday(&devp->state_ttl, NULL) == -1)
   4975 			pperror("gettimeofday");
   4976 		devp->state_ttl = set_ttl(devp->state_ttl,
   4977 		    opts.expert_max_active_time,
   4978 		    opts.expert_min_active_time);
   4979 	}
   4980 	devp->shared_data_handle = init_shared_device_info(opts.nprocs);
   4981 	if (devp->shared_data_handle == NULL) {
   4982 		plog(LOG_ERR, gettext("Unable to allocate shared data "
   4983 		    "handle for %s\n"), name);
   4984 	}
   4985 	devp->seq_passes = opts.sequential_passes;
   4986 	devp->recent = init_recent(opts.expert_recent_log_size);
   4987 	return (devp);
   4988 }
   4989 
   4990 static int
   4991 check_for_duplicate_paths(struct device *devp)
   4992 {
   4993 	uchar_t *buf;
   4994 	int buflen = min_block_size;
   4995 	struct device *d;
   4996 	struct fds *fd;
   4997 	int status = 1;
   4998 
   4999 	if ((buf = malloc(buflen)) == NULL) {
   5000 		return (0);
   5001 	}
   5002 	memset(buf, NULL, buflen);
   5003 
   5004 	/*
   5005 	 * first zero all the target blocks
   5006 	 */
   5007 	for (d = devp; d != NULL; d = d->next) {
   5008 		fd = d->fdhead;
   5009 		do {
   5010 			check_exit_flag();
   5011 			if (daio->pwrite(fd->fd, buf, buflen,
   5012 			    INDEX_TO_DIOLEN(max_disk_io_len) *
   5013 			    OPTION(start_offset), NULL) != buflen) {
   5014 					PWRITE_ERROR(fd->fd, fd->name,
   5015 					    (ulong_t)buf,
   5016 					    buflen,
   5017 					    INDEX_TO_DIOLEN(
   5018 					    max_disk_io_len) *
   5019 					    OPTION(start_offset));
   5020 				status = 0;
   5021 			}
   5022 
   5023 			fd = fd->next;
   5024 		} while (fd != d->fdhead);
   5025 	}
   5026 	/* Now write the dev structure to the first path only */
   5027 	for (d = devp; d != NULL; d = d->next) {
   5028 		check_exit_flag();
   5029 		fd = d->fdhead;
   5030 		(void) memcpy(&buf[0], d, sizeof (struct device));
   5031 		if (daio->pwrite(fd->fd, buf, buflen,
   5032 		    INDEX_TO_DIOLEN(max_disk_io_len) *
   5033 		    OPTION(start_offset), NULL) != buflen) {
   5034 			PWRITE_ERROR(fd->fd, fd->name, (ulong_t)buf,
   5035 			    buflen, INDEX_TO_DIOLEN(max_disk_io_len) *
   5036 			    OPTION(start_offset));
   5037 			status = 0;
   5038 		}
   5039 	}
   5040 	/*
   5041 	 * Now read all the blocks via each path and verify that they
   5042 	 * are ok.
   5043 	 */
   5044 	for (d = devp; d != NULL; d = d->next) {
   5045 		fd = d->fdhead;
   5046 		do {
   5047 			check_exit_flag();
   5048 			memset(buf, NULL, buflen);
   5049 			if (daio->pread(fd->fd, buf, buflen,
   5050 			    INDEX_TO_DIOLEN(max_disk_io_len) *
   5051 			    OPTION(start_offset), NULL) != buflen) {
   5052 				PREAD_ERROR(fd->fd, fd->name,
   5053 				    (ulong_t)buf, buflen,
   5054 				    INDEX_TO_DIOLEN(max_disk_io_len) *
   5055 				    OPTION(start_offset));
   5056 				status = 0;
   5057 			} else if (memcmp(buf, d, sizeof (struct device)) !=
   5058 			    0) {
   5059 				status = 0;
   5060 				pfprintf(stderr,
   5061 				    "dev %s path %s failed path check\n",
   5062 				    d->logicalname, fd->name);
   5063 			}
   5064 			fd = fd->next;
   5065 		} while (fd != d->fdhead);
   5066 	}
   5067 	free(buf);
   5068 	return (status);
   5069 }
   5070 
   5071 void
   5072 print_uname(FILE *out)
   5073 {
   5074 	static char uname_str[] = "uname";
   5075 
   5076 	struct utsname name;
   5077 	char platform[255], hw_prov[255], domain[255];
   5078 	(void) sysinfo(SI_PLATFORM, &platform[0], sizeof (platform));
   5079 	(void) sysinfo(SI_HW_PROVIDER, &hw_prov[0], sizeof (hw_prov));
   5080 	(void) sysinfo(SI_SRPC_DOMAIN, &domain[0], sizeof (domain));
   5081 	if (uname(&name) == -1) {
   5082 		pperror(uname_str);
   5083 	}
   5084 	(void) fprintf(out, "System info:\n\t%s %s %s %s %s %s %s\n",
   5085 	    name.sysname, name.nodename,
   5086 	    name.release, name.version, name.machine, platform,
   5087 	    hw_prov);
   5088 }
   5089 
   5090 void
   5091 set_max_blocks(void)
   5092 {
   5093 	int i;
   5094 
   5095 	for (i = 0; i < opts.disk_io_sizes.wlen; i++) {
   5096 		if (opts.disk_io_sizes.vals[max_disk_io_len] <
   5097 		    opts.disk_io_sizes.vals[opts.disk_io_sizes.weightings[i]]) {
   5098 			max_disk_io_len = opts.disk_io_sizes.weightings[i];
   5099 		}
   5100 	}
   5101 }
   5102 
   5103 int
   5104 check_block_sizes(void)
   5105 {
   5106 	int i;
   5107 	int bs;
   5108 	int ret = 0;
   5109 
   5110 	for (i = 0; i < opts.disk_io_sizes.wlen; i++) {
   5111 		bs = opts.disk_io_sizes.vals[opts.disk_io_sizes.weightings[i]];
   5112 		if ((bs % min_block_size) != 0) {
   5113 			plog(LOG_ERR, "Disk IO size 0x%x (%d) is not a "
   5114 			    "multiple of the minimum block size, 0x%x (%d)\n",
   5115 			    bs, bs, min_block_size, min_block_size);
   5116 			ret = 1;
   5117 		}
   5118 	}
   5119 	return (ret);
   5120 }
   5121 
   5122 static void
   5123 usr1(int sig, siginfo_t *info, void *v)
   5124 {
   5125 	plog(LOG_DEBUG, "USR1 caught\n");
   5126 	usr1_exit++;
   5127 }
   5128 static int exit_flag;
   5129 /*ARGSUSED*/
   5130 static void
   5131 set_exit_flag(int sig, siginfo_t *info, void *v)
   5132 {
   5133 	plog(LOG_DEBUG, "Sig %d\n", sig);
   5134 	if (info == NULL) {
   5135 		/*
   5136 		 * Keyboard generated SIGINT has no info pointer.
   5137 		 */
   5138 		if (sig == SIGINT)
   5139 			killer_pid = master_pid();
   5140 	} else if (killer_pid == 0)
   5141 		killer_pid = info->si_pid;
   5142 	exit_flag++;
   5143 }
   5144 void
   5145 check_exit_flag()
   5146 {
   5147 	if (exit_flag) {
   5148 		(void) sighold(SIGTERM);
   5149 		exit(killer_pid == master_pid() ? exit_status : EXIT_FAILURE);
   5150 	}
   5151 }
   5152 static void
   5153 print_startup_info(void)
   5154 {
   5155 	(void) printf("Setting up to do:\n");
   5156 	print_number(LONG_BIT, "Bit mode", "Bit mode");
   5157 	print_number_of_bytes(min_block_size,
   5158 	    "Common block size", "Common block size");
   5159 	(void) printf("\tRead %s mode\n", is_readonly() ? "only" : write_str);
   5160 	random_str = is_readonly() ? "random " : "";
   5161 	print_number(opts.wthreads, "write", "writes");
   5162 	print_number(opts.wrthreads, "Write - read", "Write - reads");
   5163 	print_number(opts.rthreads, "read", "reads");
   5164 	print_number_of_bytes(INDEX_TO_DIOLEN(max_disk_io_len),
   5165 	    "Max block size", "Max block size");
   5166 	print_number(opts.nprocs, "proc", "procs");
   5167 	print_number(opts.nlocks, "lock", "locks");
   5168 	(void) printf("\t%d%% of disk written before reads start\n",
   5169 	    opts.start_reads_percentage);
   5170 	if (!is_readonly() && opts.obscure_execute &&
   5171 	    does_check(daio->what_checker())) {
   5172 		(void) printf("\tWill execute code read into buffer\n");
   5173 	}
   5174 	(void) printf("\tUsing %s as buffer allocator\n",
   5175 	    shm_ops->longname(NULL));
   5176 
   5177 	USAGE_TRACKING_STORE_KEY_VALUE("allocator", shm_ops->longname(NULL));
   5178 #define	UT_KVS(A) USAGE_TRACKING_STORE_KEY_VALUE_INT(#A, opts.A);
   5179 	UT_KVS(nprocs);
   5180 	UT_KVS(wthreads);
   5181 	UT_KVS(wrthreads);
   5182 	UT_KVS(rthreads);
   5183 #undef UT_KVS
   5184 
   5185 }
   5186 
   5187 static void
   5188 setup_signals(void)
   5189 {
   5190 	setup_signal_catcher(SIGTERM, set_exit_flag, SA_SIGINFO);
   5191 	setup_signal_catcher(SIGHUP, set_exit_flag, SA_SIGINFO);
   5192 	setup_signal_catcher(SIGINT, set_exit_flag, SA_SIGINFO);
   5193 	setup_signal_catcher(SIGUSR1, usr1, 0);
   5194 }
   5195 
   5196 int
   5197 main(int argc, char **argv)
   5198 {
   5199 	const char *path;
   5200 	srand48(getpid());
   5201 
   5202 	path = set_diskomizer_path();
   5203 
   5204 	if (do_args(argc, argv, pprintf, path) == 0) {
   5205 		usage(*argv);
   5206 	}
   5207 	/*
   5208 	 * Usage tracking has to open after argument checking as we need
   5209 	 * the values from the configuration files.
   5210 	 */
   5211 	usage_tracking_handle = open_usage_tracking(
   5212 	    opts.obscure_usagetracking_domain,
   5213 	    opts.obscure_sendmail,
   5214 	    opts.obscure_usage_email, /* from */
   5215 	    "diskomizer", /* to */
   5216 	    diskomizer_str, /* tool */
   5217 	    VERSION);
   5218 
   5219 	setup_signal((int (*)(void *, const char *, ...))pfprintf, stderr);
   5220 	set_limits();
   5221 
   5222 	set_max_blocks();
   5223 
   5224 	if (opts.STDOUT != NULL)
   5225 		if (freopen(opts.STDOUT, "a+", stdout) == NULL) {
   5226 			pperror("Unable to open %s for stdout\n",
   5227 			    opts.STDOUT);
   5228 			exit(1);
   5229 		}
   5230 	if (opts.STDERR != NULL)
   5231 		if (freopen(opts.STDERR, "a+", stderr) == NULL) {
   5232 			pperror("Unable to open %s for stderr\n",
   5233 			    opts.STDERR);
   5234 			exit(1);
   5235 		}
   5236 	popenlog("diskomizer");
   5237 
   5238 
   5239 	if (my_gettimeofday(&start_time, NULL) == -1) {
   5240 		plog(LOG_ERR, "Unable to get time of day\n");
   5241 		exit(EXIT_FAILURE);
   5242 	}
   5243 
   5244 	set_serial_and_provider();
   5245 
   5246 	if (opts.expert_write_cluster_length == 0)
   5247 		opts.expert_write_cluster_length = 1;
   5248 	if (opts.expert_read_cluster_length == 0)
   5249 		opts.expert_read_cluster_length = 1;
   5250 	if (opts.STDERR != NULL && opts.STDOUT != NULL && opts.background)
   5251 		background();
   5252 	(void) printf("\tCopyright %s Sun Microsystems, Inc."
   5253 	    "  All Rights Reserved\n\tUse is subject to license terms.\n\t"
   5254 	    "Version %s\n", THIS_YEAR, VERSION);
   5255 
   5256 	print_args(argc, argv, (void (*)(const char *, ...))printf);
   5257 	/* Check for values which mean we do nothing */
   5258 	if (opts.nprocs < 1 || opts.start_reads_percentage > 100) {
   5259 		exit(1);
   5260 	}
   5261 
   5262 	if (opts.read_minimum > 0 &&
   5263 	    opts.rthreads < opts.wthreads * opts.read_minimum) {
   5264 		(void) printf("WARNING: The ratio of readers to writers with "
   5265 		    "read_minimum set to %d\ncould lead to thrashing "
   5266 		    "or deadlock\n", opts.read_minimum);
   5267 	}
   5268 
   5269 	/*
   5270 	 * Set up all the functions to use.
   5271 	 */
   5272 	/* First what to do on error.  */
   5273 	if ((on_error_short = setup_onerror(*argv, opts.on_error_short,
   5274 	    READ_ERR)) == NULL) {
   5275 		exit_status = EXIT_FAILURE;
   5276 		exit(exit_status);
   5277 	}
   5278 
   5279 	if ((on_error_corrupt = setup_onerror(*argv, opts.on_error_corrupt,
   5280 	    READ_ERR)) == NULL) {
   5281 		exit_status = EXIT_FAILURE;
   5282 		exit(exit_status);
   5283 	}
   5284 
   5285 	if ((on_write_error = setup_onerror(*argv, opts.on_write_error,
   5286 	    WRITE_ERR)) == NULL) {
   5287 		exit_status = EXIT_FAILURE;
   5288 		exit(exit_status);
   5289 	}
   5290 
   5291 	/* Now the type of allocator to be used */
   5292 
   5293 	if ((init_uchar_func = setup_write_buf_initializer()) == NULL ||
   5294 	    (read_buffer_initializer = setup_read_buf_initializer()) == NULL) {
   5295 		usage(*argv);
   5296 	}
   5297 
   5298 	/* Choose a shared memmory allocator */
   5299 	shm_ops = choose_shm_ops(opts.allocator);
   5300 	if (opts.device == NULL) {
   5301 		(void) prompt();
   5302 	}
   5303 	if (opts.device == NULL) {
   5304 		pfprintf(stderr, "No devices specified.\n");
   5305 		exit(1);
   5306 	}
   5307 	print_uname(stdout);
   5308 	USAGE_TRACKING_STORE_KEY_VALUE_INT("pid", getpid());
   5309 
   5310 	print_bufhdr_offsets(stdout);
   5311 
   5312 	print_serial_and_provider(stdout);
   5313 
   5314 	if (opts.nlocks == 0)
   5315 		opts.nlocks = (opts.nprocs * 2) + 1; /* should be prime */
   5316 	pgrp = setpgrp();
   5317 	parent_pid = getpid();
   5318 	setup_signals();
   5319 	/* Now setup the locking primitives to use to protect the bit maps */
   5320 	init_locks();
   5321 	/* register a clean up routine. */
   5322 	(void) atexit(cleanup);
   5323 	/* get our daio */
   5324 	if ((daio = daio_choose_ops(opts.aio_routines)) == NULL) {
   5325 		char *reason = dlerror();
   5326 		(void) pfprintf(stderr,
   5327 		    "Unable to load daio routines(%s): %s\n",
   5328 		    opts.aio_routines,
   5329 		    reason == NULL ? "Unknown" : reason);
   5330 		exit(1);
   5331 	}
   5332 
   5333 	daio->init_master(opts.checker, INDEX_TO_DIOLEN(max_disk_io_len));
   5334 
   5335 	if (opts.nloops && opts.rthreads == 0 && opts.wrthreads == 0) {
   5336 		write_loops = 1;
   5337 	}
   5338 
   5339 	USAGE_TRACKING_OPEN_KEY("devices", NULL, NULL);
   5340 	/* now open the devices */
   5341 	if ((devices = open_devices(opts.device)) == NULL) {
   5342 		(void) pfprintf(stderr, "No devices opened\n");
   5343 		exit(1);
   5344 	}
   5345 	USAGE_TRACKING_CLOSE_KEY();
   5346 	findap_fini(); /* free up any data that was cached */
   5347 
   5348 	if (check_block_sizes()) {
   5349 		exit(1);
   5350 	}
   5351 
   5352 	print_startup_info();
   5353 
   5354 	/* and go! */
   5355 	do_aio(devices, start_offset(), opts.report_time);
   5356 	/*NOTREACHED*/
   5357 	return (1);
   5358 }
   5359 
   5360 long long
   5361 convert_time(struct timeval tv)
   5362 {
   5363 	long long tyme;
   5364 	long long mill = MILLION;
   5365 
   5366 	tyme = (long long)tv.tv_sec;
   5367 	assert(tyme >= 0);
   5368 	tyme *= mill;
   5369 	assert(tyme >= 0);
   5370 	tyme += tv.tv_usec;
   5371 	assert(tyme >= 0);
   5372 	return (tyme);
   5373 }
   5374 int
   5375 longest_logical_name(void)
   5376 {
   5377 	return (Longest_logical_name);
   5378 }
   5379 int
   5380 longest_device_name(void)
   5381 {
   5382 	return (Longest_device_name);
   5383 }
   5384 void
   5385 update_time_stats(char off, struct times *tp, hrtime_t hrtyme,
   5386 	struct aio_str *aiop)
   5387 {
   5388 	if (hrtyme < 0) {
   5389 		pfprintf(stderr, "Warning time appears to go backwards\n");
   5390 		return;
   5391 	}
   5392 
   5393 	if (hrtyme > tp->worst) {
   5394 		tp->worst = hrtyme;
   5395 	}
   5396 	if (hrtyme < tp->best) {
   5397 		tp->best = hrtyme;
   5398 	}
   5399 
   5400 	tp->ave -= tp->last_few[tp->count % ARRAY_LEN(tp->last_few)];
   5401 	tp->last_few[tp->count++ % ARRAY_LEN(tp->last_few)] = hrtyme;
   5402 	tp->ave += hrtyme;
   5403 
   5404 	if (opts.how_often_to_report &&
   5405 	    (tp->count % opts.how_often_to_report) == 0) {
   5406 		plog(LOG_INFO, "%-*s (%-*s) %s times (%.*f,%.*f,%.*f) %3d%%\n",
   5407 		    longest_logical_name(), aiop->dev->logicalname,
   5408 		    longest_device_name(), aiop->fd->name,
   5409 		    tp->str,
   5410 		    opts.expert_decimal_places, (double)tp->best/ACCURACY,
   5411 		    opts.expert_decimal_places,
   5412 		    (double)(tp->ave/MIN(tp->count,
   5413 		    ARRAY_LEN(tp->last_few)))/ACCURACY,
   5414 		    opts.expert_decimal_places, (double)tp->worst/ACCURACY,
   5415 		    off);
   5416 	}
   5417 }
   5418