/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#pragma ident "@(#)diskomizer64mpism.c 2.91 09/07/16 SMI"

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * diskomizer64mpism
 *
 * Write to and then read from disk partitions and or files.
 *
 * This is a test program.
 *
 * To do:
 *
 *	The messages it prints out at the beginning are a mess.
 *	The code should be better commented.
40 * 41 * Chris.Gerhard (at) uk.sun.com - SMCC CTE 42 */ 43 #include "args.h" 44 #include "diskomizer64mpism.h" 45 #include "bufs.h" 46 #include "buf_init.h" 47 #include <netdb.h> 48 #include <sys/systeminfo.h> 49 #include <tnf/probe.h> 50 #include <sys/times.h> 51 #include <diskomizer/log.h> 52 #include "findap.h" 53 #include "device_control.h" 54 #include "timeval.h" 55 #include "list_ops.h" 56 #include "bits.h" 57 #include "locks.h" 58 #include "shm_ops.h" 59 #include "signal_catch.h" 60 #include "limit.h" 61 #include "time.h" 62 #include "prompt.h" 63 #include "errors.h" 64 #include "utils.h" 65 #include "shared_device_info.h" 66 #include "decode_errors.h" 67 #include <sys/utsname.h> 68 #include <sys/statvfs.h> 69 #include <dlfcn.h> 70 #include <diskomizer/uadmin.h> 71 #include <usage_tracking/usage_tracking.h> 72 #include "disko_usage_track.h" 73 #ifdef __i386 74 #include <note.h> 75 #endif 76 77 #define OPEN_BRACE '{' 78 #define CLOSE_BRACE '}' 79 #define DEFAULT_BLOCK_SIZE 0x200 /* 512 */ 80 81 static char diskomizer_str[] = "diskomizer"; 82 iolen_index_t max_disk_io_len; 83 static char *diffs; 84 static char diffs_str[] = "diffs"; 85 86 static char write_str[] = "write"; 87 static char read_str[] = "read"; 88 static const char nil[] = "nil"; 89 #define NIL(A) (A == NULL ? nil : A) 90 static char *random_str; 91 static hrtime_t stoptime; 92 static time_t (*secs_till_exit)(void); 93 static struct timeval start_time; 94 95 void *usage_tracking_handle; 96 97 enum read_type { 98 NORMAL_READ, 99 RETRY_READ, 100 WRITE_READ, 101 READ_ONLY_RAND, 102 READ_ONLY_SEQ 103 }; 104 typedef enum read_type read_type_t; 105 106 typedef uchar_t (*initializer_t)(int buf, int i); 107 108 static int exit_status = EXIT_SUCCESS; 109 110 /* 111 * The minimum block size that can be used. Essentially the lowest common 112 * muliple of the blocksizes available. 
 */
static int min_block_size;

/*
 * Per-process record; only the process id is kept.
 */
struct proc_store {
	pid_t pid;
};
static struct proc_store *proc_store;
/*
 * The daio_ops
 */
static struct daio_ops *daio;
/*
 * All the functions we have
 */
time_t handle_read(struct aio_str *aiop, ullong_t start);
time_t do_new_read(struct aio_str *aiop, ullong_t start, read_type_t read_type);
time_t handle_readonly_rand(struct aio_str *aiop, ullong_t start);

/* The value handed back by this_proc(). */
static int proc_no;
long long convert_time(struct timeval tv);
void update_time_stats(char off, struct times *tp, long long tyme,
	struct aio_str *aiop);
static int pend_write_with_lock(bitmap_t map[], ullong_t off, int maplen);
static int do_memcmp(ullong_t start, struct aio_str *aiop);
static struct device *open_device(char *name, struct paths *, ullong_t size,
	int paths_to_use, int error_paths);
static void check_matching_io(ullong_t start, struct aio_str *aiop);
static int is_master(void);
/* Block choosers: both forward to unwritten_block() with a fallback chooser. */
static void unwritten_block_rand(bitmap_t *map, struct aio_str *aiop,
	ullong_t start, ullong_t len, int maplen);
static void unwritten_block_seq(bitmap_t *map, struct aio_str *aiop,
	ullong_t start, ullong_t len, int maplen);
extern void close_and_free_paths(struct device *dev);
extern void run_func(uchar_t *buf, size_t size);
static struct bufhdr
build_bufhr(struct device *dev, ullong_t start, ullong_t off);
void newfd(struct aio_str *aiop);
struct fds *open_path(struct device *devp, char *name, ullong_t size);
void cancel_all_io_byfd(struct fds *fd);
struct fds *find_path(struct fds *fdhead, char path_id);
static int check_for_duplicate_paths(struct device *devp);
static void do_start_cancelled_io(struct device *devices, ullong_t start);
/* Constant-result helpers used as default hooks and as setup routines. */
static int return_zero(void);
static int return_one(void);
/*
 * The error handling functions.
 */
static loop_type on_error_reread(ullong_t start, struct aio_str *aiop);
static loop_type on_error_exit(ullong_t start, struct aio_str *aiop);
static loop_type on_error_stop(ullong_t start, struct aio_str *aiop);
static loop_type on_error_nop(ullong_t start, struct aio_str *aiop);
static loop_type on_error_abort(ullong_t start, struct aio_str *aiop);
static loop_type on_error_pause(ullong_t start, struct aio_str *aiop);
static loop_type on_error_retry(ullong_t start, struct aio_str *aiop);
static loop_type on_error_rewrite(ullong_t start, struct aio_str *aiop);
static loop_type on_error_fail_path(ullong_t start, struct aio_str *aiop);
/*
 * Error handling init functions.
 */
static int init_path_stop_check(void);
static int init_stop_check(void);
/*
 * the "globals" that we use
 */
struct device *devices; /* all the devices there are */

write_buf_initializer_t init_uchar_func;
static read_buf_initializer_t read_buffer_initializer;

/* Process group signalled with SIGTERM by infantacide(). */
pid_t pgrp;
/*
 * statics
 */
static pid_t parent_pid;
static pid_t killer_pid;
static on_error_t *on_error_corrupt;
static on_error_t *on_error_short;
static on_error_t *on_write_error;
static int Longest_device_name = 0;
static int Longest_logical_name = 0;
static int write_loops;

static int usr1_exit = 0;

/* Shared memory operations; attach()/detach() bracket every blks access. */
struct shm_ops *shm_ops;
/*
 * Count of the total number of io's that are currently cancelled.
 */
static int cancelled_count = 0;
/*
 * Start cancelled. Only gets unset from nop if there are cancelled ios
 * to restart. A rare thing.
205 */ 206 static void (*start_cancelled_io)(struct device *, ullong_t) = 207 (void (*)(struct device *, ullong_t)) nop; 208 static void (*start_deferred)(struct device *dev, ullong_t) = 209 (void (*)(struct device *, ullong_t)) nop; 210 211 static int (*stop_check)(void *handle) = (int (*)(void *))return_zero; 212 static int (*path_stop_check)(struct fds *fd, struct device *dev) = 213 (int (*)(struct fds *, struct device *dev))return_zero; 214 static char nom[] = "no memory"; 215 #define NOT_NULL(A) (A == NULL ? &nom[0] : A) 216 #define PLURAL(A) (A == 1 ? "" : "s") 217 #define LEN_BYTES2BLOCKS(A) (A->length / INDEX_TO_DIOLEN(max_disk_io_len)) 218 #define TRUE_OR_FALSE(A) (A ? "true" : "false") 219 220 struct error_handlers { 221 char *name; /* String that describes this error handler */ 222 on_error_t func; /* error handleing function */ 223 int (*setup)(void); /* init routine for the error handler */ 224 uint_t breaker:1; /* Is this the last error handler on the list */ 225 uint_t rw:2; /* Does this hander apply to read or write or both */ 226 }; 227 228 #define READ_ERR 1 229 #define WRITE_ERR (READ_ERR << 1) 230 #define BOTH_ERR (READ_ERR | WRITE_ERR) 231 232 struct error_handlers on_error_table[] = { 233 {"EXIT", on_error_exit, return_one, 1, BOTH_ERR}, 234 {"ABORT", on_error_abort, return_one, 1, BOTH_ERR}, 235 {"CONTINUE", on_error_nop, return_one, 0, BOTH_ERR}, 236 {"NONE", on_error_nop, return_one, 0, BOTH_ERR}, 237 {"STOP", on_error_stop, init_stop_check, 1, BOTH_ERR}, 238 {"PAUSE", on_error_pause, return_one, 0, BOTH_ERR}, 239 {"RETRY", on_error_retry, return_one, 0, BOTH_ERR}, 240 {"FAIL_PATH", on_error_fail_path, init_path_stop_check, 0, BOTH_ERR}, 241 {"UADMIN", on_error_uadmin, uadmin_init, 1, BOTH_ERR}, 242 {"REREAD", on_error_reread, return_one, 0, READ_ERR}, 243 {"REWRITE", on_error_rewrite, return_one, 0, WRITE_ERR} 244 }; 245 246 /* 247 * TNF declarations. 
248 */ 249 /* 250 * The DEFINE should not have explicit mentions of the daio_ZZZZ 251 * elements, they should be opaque. 252 */ 253 TNF_DEFINE_RECORD_5(aio_str_t, aio_tnf_str, 254 tnf_opaque, buf, tnf_short, iolen, tnf_ulonglong, off, 255 tnf_longlong, aio_res.result.daio_return, 256 tnf_uint, aio_res.result.daio_errno) 257 /* 258 * locking functions. 259 */ 260 static char * 261 hostname(void) 262 { 263 static char hostname[MAXHOSTNAMELEN + 1]; 264 (void) sysinfo(SI_HOSTNAME, &hostname[0], MAXHOSTNAMELEN); 265 return (&hostname[0]); 266 } 267 int 268 this_proc(void) 269 { 270 return (proc_no); 271 } 272 static int 273 return_one(void) 274 { 275 return (1); 276 } 277 static int 278 return_zero(void) 279 { 280 return (0); 281 } 282 283 void 284 nop(void) 285 { 286 } 287 288 static void 289 not_null_free(void *ptr) 290 { 291 if (ptr != NULL) 292 free(ptr); 293 } 294 static time_t 295 inf_secs_till_exit(void) 296 { 297 return (LONG_MAX); 298 } 299 static time_t 300 do_secs_till_exit(void) 301 { 302 return ((stoptime - gethrtime()) / BILLION); 303 } 304 305 off64_t 306 start_offset(void) 307 { 308 return ((off64_t)(opts.start_offset * 309 (ullong_t)INDEX_TO_DIOLEN(max_disk_io_len))); 310 } 311 312 static void 313 return_aio_read_buf(struct aio_str *aiop) 314 { 315 if (!(aiop->count % opts.expert_release_read_buffers_after_n_uses) && 316 aiop->buf != NULL) { 317 return_read_buf(aiop->buf); 318 aiop->buf = NULL; 319 } 320 } 321 322 static int 323 do_stop_check(void *handle) 324 { 325 return (get_shared_stop_flag(handle, this_proc())); 326 } 327 int 328 is_readonly(void) 329 { 330 return (opts.o_rdonly == 1); 331 } 332 const char * 333 rw_string(void) 334 { 335 return (is_readonly() ? read_str : write_str); 336 } 337 /* 338 * background. disassociate from controlling tty make session leader 339 * then fork. The parent exits and the child goes on in 340 * the back ground. 
341 */ 342 static void 343 background() 344 { 345 pid_t pid; 346 (void) freopen("/dev/null", "+r", stdin); 347 348 pid = opts.use_fork1 == 0 ? fork() : fork1(); 349 350 if (pid == 0) { 351 if (setsid() == (pid_t)-1) 352 pperror("setsid"); 353 return; 354 } 355 if (pid < 0) { 356 FORK_ERROR(opts.use_fork1 == 0 ? "" : "s"); 357 exit(1); 358 } 359 exit(0); 360 } 361 static struct blks * 362 aio_attach(struct aio_str *aiop) 363 { 364 int error_count = 0; 365 struct blks *blocks; 366 while ((blocks = shm_ops->attach(AIO_BLOCK_HANDLE(aiop))) == 367 NULL) { 368 if (error_count++ % 10000 == 0) 369 ATTACH_ERROR(AIO_BLOCK_HANDLE(aiop)); 370 } 371 if (error_count > 0) 372 plog(LOG_WARNING, "attached o.k.\n"); 373 return (blocks); 374 } 375 static void 376 update_aio_time_stats(struct aio_str *aiop, struct times *ts) 377 { 378 if (aiop->count > 0) { 379 ullong_t len = LEN_BYTES2BLOCKS(aiop->dev); 380 381 update_time_stats((100 * MIN(aiop->dev->block, len))/ 382 ((aiop->dev->length/ 383 INDEX_TO_DIOLEN(max_disk_io_len))), 384 ts, 385 DAIO_GET_TIME_TAKEN(aiop->aio_res), aiop); 386 } 387 } 388 389 static void 390 update_aio_read_stats(struct aio_str *aiop) 391 { 392 update_aio_time_stats(aiop, &aiop->fd->read_times); 393 } 394 395 static void 396 update_aio_write_stats(struct aio_str *aiop) 397 { 398 update_aio_time_stats(aiop, &aiop->fd->write_times); 399 } 400 401 ullong_t 402 diskomizer_off2byteoff(ullong_t off) 403 { 404 return ((off + opts.start_offset) * INDEX_TO_DIOLEN(max_disk_io_len)); 405 } 406 407 static ullong_t 408 byteoff2diskomizer_off(ullong_t off) 409 { 410 return ((off/INDEX_TO_DIOLEN(max_disk_io_len)) - opts.start_offset); 411 } 412 /* 413 * Sanity check. 
414 */ 415 #define ASSERT_OFFSET(X) \ 416 assert(byteoff2diskomizer_off(diskomizer_off2byteoff(X)) == X) 417 418 static ullong_t 419 aio_str2byteoff(struct aio_str *aiop) 420 { 421 return (diskomizer_off2byteoff(aiop->off)); 422 } 423 424 static int64_t 425 aio_str2lba(struct aio_str *aiop) 426 { 427 long long byteoff; 428 int64_t lba; 429 430 if (aiop->dev->v_part == NULL || aiop->dev->device_block_size == 0) { 431 return (-1); 432 } 433 byteoff = aio_str2byteoff(aiop); 434 435 lba = byteoff / (int64_t)aiop->dev->device_block_size; 436 437 return (aiop->dev->v_part->p_start + lba); 438 } 439 440 static void 441 plog_dd(int pri, struct aio_str *aiop) 442 { 443 if ((INDEX_TO_DIOLEN(max_disk_io_len) % 444 INDEX_TO_DIOLEN(aiop->iolen)) == 0) { 445 daio->plog_dd(pri, aiop->fd->fd, INDEX_TO_DIOLEN(aiop->iolen), 446 INDEX_TO_DIOLEN(max_disk_io_len), 447 aio_str2byteoff(aiop)); 448 449 } 450 } 451 452 static void 453 report_device(int pri, struct aio_str *aiop) 454 { 455 plog(pri, "Requested File %s (%s)\n", 456 aiop->fd->name, aiop->dev->logicalname); 457 } 458 /* 459 * report block. 460 * 461 * Report all the information about the block that was requested to be read 462 */ 463 static void 464 report_offset(int pri, struct aio_str *aiop) 465 { 466 long long byteoff; 467 long long lba; 468 469 byteoff = aio_str2byteoff(aiop); 470 471 lba = aio_str2lba(aiop); 472 473 if (lba >= 0) { 474 plog(pri, 475 "Requested File offset 0t%lld (0x%llx), block size " 476 "0t%d (0x%x), LBA 0t%lld (0x%llx)\n", 477 byteoff, byteoff, INDEX_TO_DIOLEN(aiop->iolen), 478 INDEX_TO_DIOLEN(aiop->iolen), lba, lba); 479 } else { 480 plog(pri, "Requested File offset 0t%lld (0x%llx), block size " 481 "0t%d (0x%x)\n", byteoff, byteoff, 482 INDEX_TO_DIOLEN(aiop->iolen), 483 INDEX_TO_DIOLEN(aiop->iolen)); 484 } 485 } 486 487 static void 488 report_device_and_offset(int pri, struct aio_str *aiop) 489 { 490 report_device(pri, aiop); 491 report_offset(pri, aiop); 492 } 493 494 495 /* 496 * report_error. 
This is the generic error reporting routine. 497 * It reports all errors to stderr, giving similar information 498 * and advise as to other commands that can be tried. 499 */ 500 void 501 report_error(struct aio_str *aiop, const union err_info u, err_type error) 502 { 503 struct timeval now_tv; 504 long long disk_block; 505 int pri; 506 507 while (my_gettimeofday(&now_tv, NULL) == -1) 508 pperror("gettimeofday"); 509 510 disk_block = aio_str2byteoff(aiop); 511 512 mutex->stderr_enter(); 513 514 515 if (error == ERR_HUNG) { 516 pri = LOG_WARNING; 517 time_log(pri, now_tv.tv_sec, "Time now"); 518 time_log(pri, aiop->tv.tv_sec, "Requested io requested at"); 519 520 report_device(pri, aiop); 521 522 if (is_readonly()) { 523 plog(pri, "%s has %ld out of %ld read%s\n", 524 aiop->dev->logicalname, 525 aiop->fd->number_of_hung_read, 526 aiop->fd->total_read, 527 aiop->fd->number_of_hung_read == 1 ? "" : "s"); 528 } else { 529 plog(pri, "%s has %ld out of %ld read%s and %ld " 530 "out of %ld write%s\n", 531 aiop->dev->logicalname, 532 aiop->fd->number_of_hung_read, 533 aiop->fd->total_read, 534 aiop->fd->number_of_hung_read == 1 ? "" : "s", 535 aiop->fd->number_of_hung_write, 536 aiop->fd->total_write, 537 aiop->fd->number_of_hung_write == 1 ? "" : "s"); 538 } 539 plog(pri, "waiting for more than %ld second%s\n", 540 u.time, PLURAL(u.time)); 541 if (is_readonly()) { 542 plog(pri, "Last read took %lld\n", 543 aiop->fd->last_read_time/BILLION); 544 } else { 545 plog(pri, "Last read took %lld, last write took " 546 "%lld seconds\n", 547 aiop->fd->last_read_time/BILLION, 548 aiop->fd->last_write_time/BILLION); 549 } 550 plog(pri, 551 "oldest i/o is a %s waiting for %ld second%s\n", 552 is_read_io(aiop) ? "read" : "write", 553 now_tv.tv_sec - aiop->tv.tv_sec, 554 ((now_tv.tv_sec - aiop->tv.tv_sec) > 1) ? 
"s" : ""); 555 } else if (error == ERR_CORRUPT) { 556 time_t request_time; 557 time_t return_time; 558 int read_count; 559 struct blks *blocks; 560 struct fds *fd; 561 562 pri = LOG_ERR; 563 564 time_log(pri, now_tv.tv_sec, "Time now"); 565 time_log(pri, aiop->tv.tv_sec, "Requested io requested at"); 566 567 report_device_and_offset(pri, aiop); 568 569 report_error_desc(pri, aiop, u.str); 570 571 blocks = aio_attach(aiop); 572 573 fd = find_path(aiop->dev->fdhead, 574 blocks[AIO_BLOCK_INDEX(aiop)].path_id); 575 assert(fd != NULL); 576 577 read_count = blocks[AIO_BLOCK_INDEX(aiop)].read_count; 578 request_time = blocks[AIO_BLOCK_INDEX(aiop)].last_requested; 579 return_time = request_time + GET_LAST_RETURN( 580 blocks[AIO_BLOCK_INDEX(aiop)].last_returned_delta); 581 582 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 583 584 time_log(pri, blocks[AIO_BLOCK_INDEX(aiop)].last_requested, 585 "Last %s to the requested block submitted", rw_string()); 586 time_log(pri, return_time, 587 "Last %s to the requested block returned", rw_string()); 588 589 plog(pri, "Last %s to the requested block used path: %s\n", 590 rw_string(), fd->name); 591 592 if (!is_readonly()) { 593 plog(pri, "Requested block has been read %d times " 594 "since last written\n", read_count); 595 } 596 597 decode_errors(pri, aiop, read_count); 598 } else if (error == ERR_DEFERRED) { 599 pri = LOG_WARNING; 600 601 time_log(pri, now_tv.tv_sec, "Time now"); 602 time_log(pri, aiop->tv.tv_sec, "Requested io requested at"); 603 plog(pri, "%s to device %s deferred\n", 604 u.str, aiop->dev->logicalname); 605 } else if (DAIO_RETURN(aiop->aio_res) < 0) { 606 char *datestr; 607 pri = LOG_ERR; 608 609 time_log(pri, now_tv.tv_sec, "Time now"); 610 time_log(pri, aiop->tv.tv_sec, "Requested io requested at"); 611 (void) plog(pri, "%s %s%s error, errno %d %s\n", 612 aiop->fd->name, u.str, 613 aiop->retrycnt > 0 ? 
" retry" : "", 614 DAIO_ERROR(aiop->aio_res), 615 strerror(DAIO_ERROR(aiop->aio_res))); 616 datestr = alloc_time_str_fmt(aiop->tv.tv_sec, "%b %e %H:%M"); 617 if (datestr != NULL) { 618 plog(pri, "Try \"egrep '^%s.*%s' " 619 "/var/adm/messages\"\n", 620 datestr, hostname()); 621 free(datestr); 622 } 623 } else { 624 pri = LOG_WARNING; 625 626 time_log(pri, now_tv.tv_sec, "Time now"); 627 time_log(pri, aiop->tv.tv_sec, "Requested io requested at"); 628 plog(pri, "%s short %s%s, Transferred %ld (%#lx)" 629 " bytes, requested %d (%#x) bytes.\n", 630 aiop->fd->name, u.str, 631 aiop->retrycnt > 0 ? " retry" : "", 632 (long)DAIO_RETURN(aiop->aio_res), 633 (long)DAIO_RETURN(aiop->aio_res), 634 INDEX_TO_DIOLEN(aiop->iolen), 635 INDEX_TO_DIOLEN(aiop->iolen)); 636 } 637 dlog(pri, "Block at byte offset 0t%lld (%#llx) block size %d (%#x)\n", 638 disk_block, disk_block, INDEX_TO_DIOLEN(aiop->iolen), 639 INDEX_TO_DIOLEN(aiop->iolen)); 640 plog_dd(pri, aiop); 641 642 (void) fflush(stderr); 643 (void) fsync(fileno(stderr)); 644 mutex->stderr_exit(); 645 } 646 ulong_t 647 my_lrand(void) 648 { 649 union { 650 ulong_t l; 651 uint32_t i[sizeof (ulong_t)/sizeof (uint32_t)]; 652 } u; 653 int i; 654 #ifdef __lint 655 ZERO_OBJ(u); 656 #endif 657 658 for (i = 0; i < (sizeof (ulong_t)/sizeof (uint32_t)); i++) 659 u.i[i] = (uint32_t)lrand48(); 660 661 return (u.l); 662 } 663 664 static void 665 remove_from_all_aios(struct aio_str *aiop) 666 { 667 remove_from_aio_list(&aiop->fd->all_aios, aiop); 668 } 669 670 static void 671 infantacide(void) 672 { 673 (void) killpg(pgrp, SIGTERM); 674 } 675 /*ARGSUSED*/ 676 static loop_type 677 on_error_exit(ullong_t start, struct aio_str *aiop) 678 { 679 union err_info err_info; 680 681 err_info.str = "On error exit"; 682 DAIO_SET_RETURN(aiop->aio_res, 0); 683 report_error(aiop, err_info, ERR_SYS); 684 remove_from_all_aios(aiop); 685 (void) sighold(SIGTERM); 686 if (incr_shared_device_error(aiop->dev->shared_data_handle, 687 aiop->dev->errors) != -1) { 688 
aiop->dev->errors = 0; 689 } 690 exit_status = EXIT_FAILURE; 691 exit(exit_status); 692 /*NOTREACHED*/ 693 return (BREAK); 694 } 695 /*ARGSUSED*/ 696 static loop_type 697 on_error_stop(ullong_t start, struct aio_str *aiop) 698 { 699 pfprintf(stderr, "%s Set On error stop\n", aiop->fd->name); 700 if (set_shared_stop_flag(aiop->dev->shared_data_handle) == -1) 701 aiop->dev->need_to_stop = 1; 702 aiop->dev->stop_flag = 1; 703 return (BREAK); 704 } 705 706 /*ARGSUSED*/ 707 static loop_type 708 on_error_nop(ullong_t start, struct aio_str *aiop) 709 { 710 if (is_write_io(aiop)) { 711 struct blks *blocks; 712 713 /* 714 * Clear the last io as the retries never suceeded 715 * so we don't read this block which is now in an 716 * undefined state. 717 */ 718 blocks = aio_attach(aiop); 719 blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io = NULL; 720 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 721 } 722 pfprintf(stderr, "%s On error continue\n", aiop->fd->name); 723 return (BREAK); 724 } 725 726 /*ARGSUSED*/ 727 static loop_type 728 on_error_abort(ullong_t start, struct aio_str *aiop) 729 { 730 union err_info err_info; 731 732 err_info.str = "On error abort"; 733 report_error(aiop, err_info, ERR_SYS); 734 /* pfprintf(stderr, "On error abort\n"); */ 735 (void) sighold(SIGTERM); 736 if (incr_shared_device_error(aiop->dev->shared_data_handle, 737 aiop->dev->errors) != -1) { 738 aiop->dev->errors = 0; 739 } 740 abort(); /* On error abort. This one is o.k. */ 741 return (BREAK); 742 } 743 /* 744 * report_hangers_fd. counts the number of I/O requests that 745 * have been waiting for more than hanger_time seconds and then 746 * calls report_error() with the i/o that has been waiting the 747 * longest and a count of the number of i/o requests that are 748 * over time. It only calls report_error() when the number of i/o 749 * requests or the oldest outstanding i/o change or if the last 750 * report was more than hanger_time seconds ago and there are some 751 * i/o hung. 
752 */ 753 static int 754 report_hangers_fd(struct fds *fd, time_t tyme, time_t hanger_time) 755 { 756 int total_hung_read = 0; 757 int total_hung_write = 0; 758 #ifdef IO_COUNT_DEBUG 759 int total_read = 0; 760 int total_write = 0; 761 #endif 762 struct aio_str *aiop; 763 union err_info err_info; 764 765 if (fd->error_path != 0 || fd->stop_flag != 0) { 766 return (0); 767 } 768 /* fd->total_read = fd->total_write = 0; */ 769 770 err_info.time = hanger_time; 771 772 for (aiop = fd->all_aios.head; aiop != NULL; aiop = aiop->next) { 773 if (tyme - aiop->tv.tv_sec > hanger_time) { 774 if (is_read_io(aiop)) { 775 total_hung_read++; 776 } else { 777 total_hung_write++; 778 } 779 } else { 780 break; 781 } 782 } 783 784 #ifdef IO_COUNT_DEBUG 785 assert(total_read == fd->total_read); 786 assert(total_write == fd->total_write); 787 #endif 788 789 if (fd->oldest_io == NULL) { 790 fd->oldest_io = fd->all_aios.head; 791 } 792 793 if (total_hung_read != fd->number_of_hung_read || 794 total_hung_write != fd->number_of_hung_write || 795 (fd->all_aios.head != fd->oldest_io && 796 (total_hung_read || total_hung_write))) { 797 798 fd->number_of_hung_read = total_hung_read; 799 fd->number_of_hung_write = total_hung_write; 800 report_error(fd->all_aios.head, err_info, ERR_HUNG); 801 fd->last_report = tyme; 802 fd->oldest_io = fd->all_aios.head; 803 804 } else if (total_hung_read + total_hung_write > 0 && 805 fd->all_aios.head != NULL && fd->last_report + hanger_time < tyme) { 806 807 report_error(fd->all_aios.head, err_info, ERR_HUNG); 808 fd->last_report = tyme; 809 } 810 return (total_hung_read + total_hung_write); 811 } 812 /* 813 * Search the list of i/o that are currently outstanding and report 814 * on any that have been outstanding for more than hanger_time. 815 * Also display howlong the oldest i/o has been Waiting for and when 816 * it was submitted. 
817 */ 818 static void 819 report_hangers(struct device *dev, time_t tyme, time_t hanger_time) 820 { 821 int total = 0; 822 struct fds *fd; 823 824 for (fd = dev->fdhead; ; fd = fd->next) { 825 total += report_hangers_fd(fd, tyme, hanger_time); 826 if (dev->fdhead == fd->next) 827 break; 828 } 829 } 830 static void 831 report_all_hangers(struct device *dev, time_t hanger_time) 832 { 833 struct timeval tv; 834 835 while (my_gettimeofday(&tv, NULL) == -1) 836 pperror("gettimeofday"); 837 838 for (; dev != NULL; dev = dev->next) { 839 report_hangers(dev, tv.tv_sec, hanger_time); 840 } 841 } 842 char * 843 my_strdup(const char *s) 844 { 845 char *x = strdup(s); 846 if (x == NULL) { 847 STRDUP_ERROR(s); 848 } 849 return (x); 850 } 851 void * 852 my_calloc(long a, long b) 853 { 854 void *x; 855 856 x = calloc(a, b); 857 858 if (x == NULL) { 859 CALLOC_ERROR(a, b); 860 return (NULL); 861 } 862 return (x); 863 } 864 865 static void 866 add_to_all_aios(struct aio_str *aiop) 867 { 868 add_to_aio_list(&aiop->fd->all_aios, aiop); 869 } 870 871 static int 872 init_read(struct aio_str aio[], ullong_t start) 873 { 874 struct aio_str *aiop; 875 876 aiop = my_calloc(1, sizeof (struct aio_str)); 877 if (aiop == NULL) { 878 pfprintf(stderr, "init_read, can't allocate memory\n"); 879 return (0); 880 } 881 882 aiop->buf = NULL; 883 aiop->off = 0; 884 aiop->handler = is_readonly() ? handle_readonly_rand : handle_read; 885 aiop->dev = aio->dev; 886 aiop->fd = aio->dev->fdhead; 887 aio->dev->fdhead = aio->dev->fdhead->next; 888 add_to_all_aios(aiop); 889 (void) do_new_read(aiop, start, 890 is_readonly() ? 
READ_ONLY_RAND : NORMAL_READ); 891 return (1); 892 } 893 static int 894 has_no_unwritten(struct aio_str *aiop) 895 { 896 return (aiop->dev->unwritten == NULL); 897 } 898 static void 899 push_unwritten(struct aio_str *aiop) 900 { 901 struct offset_list *new; 902 903 if ((new = calloc(1, sizeof (struct offset_list))) == NULL) { 904 CALLOC_ERROR(1, sizeof (struct offset_list)); 905 aiop->dev->failed_to_push_unwritten = 1; 906 } else { 907 plog(LOG_DEBUG, 908 "Block %#llx (0t%lld) %s pushed onto unwritten queue\n", 909 aiop->off, aiop->off, aiop->dev->logicalname); 910 new->offset = aiop->off; 911 new->next = aiop->dev->unwritten; 912 aiop->dev->unwritten = new; 913 aiop->dev->choose_block = unwritten_block_seq; 914 } 915 } 916 static int 917 find_unwritten(bitmap_t *map, struct aio_str *aiop, int maplen) 918 { 919 struct offset_list *u, *p; 920 int status = 0; 921 p = NULL; 922 923 for (u = aiop->dev->unwritten; u != NULL; u = u->next) { 924 if (pend_write_with_lock(map, u->offset, maplen) == 0) { 925 if (p == NULL) 926 aiop->dev->unwritten = u->next; 927 else 928 p->next = u->next; 929 aiop->off = u->offset; 930 plog(LOG_DEBUG, 931 "Block %#llx (0t%lld) %s locked and removed " 932 "from unwritten queue\n", 933 aiop->off, aiop->off, aiop->dev->logicalname); 934 free(u); 935 status = 1; 936 break; 937 } 938 p = u; 939 } 940 return (status); 941 } 942 static int 943 is_unwritten(struct aio_str *aiop) 944 { 945 struct offset_list *u; 946 for (u = aiop->dev->unwritten; u != NULL; u = u->next) { 947 if (u->offset == aiop->off) 948 return (1); 949 } 950 return (0); 951 } 952 /* 953 * randomish_block 954 * return a random block to try to do io too or from. If we 955 * are short of memory the block is less random to try and 956 * decrease the number of attach/detach pairs that actually 957 * result in system calls, and therefore reduce the number of 958 * faults. 
959 * 960 * In particular when short of memory the next io will tend to 961 * be in the same block ob blks structures or the next block 962 * for odd numbered processes and the previous block for even 963 * numbered processes. The overall effect is still close to 964 * random at the device, but individual processes thrash less. 965 */ 966 ulong_t 967 randomish_block(struct aio_str *aiop) 968 { 969 ulong_t t; 970 if (shm_ops->is_short_of_mem()) { 971 t = aiop->off + ((my_lrand() % (shm_ops->max_size() / 972 sizeof (struct blks))) * this_proc() % 2 ? -1 : 1); 973 } else { 974 t = my_lrand(); 975 } 976 return (t); 977 } 978 /*ARGSUSED2*/ 979 void 980 rand_block(bitmap_t *map, struct aio_str *aiop, 981 ullong_t start, ullong_t len, int maplen) 982 { 983 ulong_t t; 984 ullong_t *next_io_blk_ptr; 985 986 if (is_write_io(aiop)) { 987 next_io_blk_ptr = &aiop->dev->next_write_blk; 988 if (!(aiop->dev->next_write_blk % 989 opts.expert_write_cluster_length)) { 990 t = randomish_block(aiop); 991 t = t - (t % opts.expert_write_cluster_length); 992 } else { 993 TNF_PROBE_1(cluster_write, "rand_block", 994 "sunw%cte%diskomizer%blocks write cluster", 995 tnf_ulonglong, next_read_blk, 996 aiop->dev->next_read_blk); 997 t = *next_io_blk_ptr; 998 } 999 } else { 1000 next_io_blk_ptr = &aiop->dev->next_read_blk; 1001 if (!(aiop->dev->next_read_blk % 1002 opts.expert_read_cluster_length)) { 1003 t = randomish_block(aiop); 1004 t = t - (t % opts.expert_read_cluster_length); 1005 } else { 1006 TNF_PROBE_1(cluster_read, "rand_block", 1007 "sunw%cte%diskomizer%blocks read cluster", 1008 tnf_ulonglong, next_read_blk, 1009 aiop->dev->next_read_blk); 1010 t = *next_io_blk_ptr; 1011 } 1012 } 1013 aiop->off = (t)%(len); 1014 aiop->off = find_next_free(map, aiop->off, len, maplen); 1015 if (next_io_blk_ptr != NULL) 1016 *next_io_blk_ptr = aiop->off + 1; 1017 assert(aiop->off < len); 1018 } 1019 1020 static void 1021 unwritten_block(bitmap_t *map, struct aio_str *aiop, 1022 ullong_t start, 
ullong_t len, int maplen, choose_block_t chooser) 1023 { 1024 plog(LOG_DEBUG, "in unwritten_block %llx\n", 1025 aiop->dev->unwritten == NULL ? 1026 0LL : aiop->dev->unwritten->offset); 1027 if (find_unwritten(map, aiop, maplen)) { 1028 struct blks *blocks; 1029 1030 if (has_no_unwritten(aiop)) 1031 aiop->dev->choose_block = chooser; 1032 blocks = aio_attach(aiop); 1033 blocks[AIO_BLOCK_INDEX(aiop)].u.was_unwritten = 1; 1034 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 1035 } else { 1036 chooser(map, aiop, start, len, maplen); 1037 } 1038 } 1039 /* 1040 * find the "next" block to read for this aio_str. Increment by 1041 * by the number of processes, so that when initializing the disk 1042 * each block only gets written once. 1043 */ 1044 static void 1045 seq_block(bitmap_t *map, struct aio_str *aiop, 1046 ullong_t start, ullong_t len, int maplen) 1047 { 1048 int does_not_have_lock; 1049 1050 assert((aiop->dev->block % opts.nprocs) == this_proc() || 1051 aiop->dev->block == len); 1052 1053 aiop->off = aiop->dev->block; 1054 if (aiop->dev->block >= len) { 1055 does_not_have_lock = 1; 1056 } else while ((does_not_have_lock = 1057 pend_write_with_lock(map, aiop->off, maplen)) != 0) { 1058 /* 1059 * Only push blocks that this process would have to write 1060 * onto the unwritten queue. This only effects the last 1061 * block on the device. 1062 */ 1063 if ((aiop->off % opts.nprocs) == this_proc()) { 1064 push_unwritten(aiop); 1065 } 1066 aiop->dev->block += opts.nprocs; 1067 aiop->off = aiop->dev->block % len; 1068 if (aiop->dev->block >= len) { 1069 aiop->dev->block = len; 1070 break; 1071 } 1072 } 1073 1074 if (does_not_have_lock != 0) { 1075 if (aiop->dev->seq_passes == 0 || 1076 --aiop->dev->seq_passes == 0) { 1077 time_now_log(LOG_NOTICE, 1078 "Finished sequential %ss on %s count %d", 1079 is_readonly() ? 
read_str : write_str, 1080 aiop->dev->logicalname, 1081 aiop->count); 1082 1083 if (has_no_unwritten(aiop)) { 1084 aiop->dev->choose_block = rand_block; 1085 } else { 1086 aiop->dev->choose_block = unwritten_block_rand; 1087 } 1088 } else { 1089 aiop->off = aiop->dev->block = this_proc(); 1090 time_now_log(LOG_NOTICE, 1091 "Starting sequential series again on %s counts %d", 1092 aiop->dev->logicalname, aiop->count); 1093 } 1094 aiop->dev->choose_block(map, aiop, start, len, maplen); 1095 } else { 1096 assert(!does_not_have_lock); 1097 1098 ASSERT_OFFSET(aiop->dev->block); 1099 1100 aiop->dev->block += opts.nprocs; 1101 if (aiop->dev->block >= len) { 1102 aiop->dev->block = len; 1103 } 1104 } 1105 } 1106 static void 1107 unwritten_block_seq(bitmap_t *map, struct aio_str *aiop, 1108 ullong_t start, ullong_t len, int maplen) 1109 { 1110 unwritten_block(map, aiop, start, len, maplen, seq_block); 1111 } 1112 static void 1113 unwritten_block_rand(bitmap_t *map, struct aio_str *aiop, 1114 ullong_t start, ullong_t len, int maplen) 1115 { 1116 unwritten_block(map, aiop, start, len, maplen, rand_block); 1117 } 1118 /* 1119 * I leave the source as this _may_ be useful in the future. 
1120 */ 1121 #ifdef NOT_USED_CODE 1122 static char 1123 set_write(bitmap_t map[], ullong_t off, int maplen) 1124 { 1125 ulong_t tmp = GET_OFF(off) % maplen; 1126 char status; 1127 1128 mutex->enter(tmp); 1129 if (map[tmp] & GET_BIT(off)) { 1130 /* we are already locked */ 1131 status = 0; 1132 } else { 1133 map[tmp] |= GET_BIT(off); 1134 status = 1; 1135 } 1136 mutex->exit(tmp); 1137 TNF_PROBE_3(set_write, "set_write", "sunw%cte%diskomizer", 1138 tnf_opaque, off, off, 1139 tnf_opaque, map, map, 1140 tnf_char, status, status); 1141 return (status); 1142 } 1143 #endif 1144 1145 void 1146 clear_write(bitmap_t map[], ullong_t off, ulong_t maplen) 1147 { 1148 ulong_t tmp = (GET_OFF(off) % maplen); 1149 ulong_t x; 1150 bitmap_t bit = ~(GET_BIT(off)); 1151 1152 mutex->enter(tmp); 1153 x = map[tmp]; 1154 map[tmp] &= bit; 1155 assert(~bit != (ulong_t)0); 1156 if (x == map[tmp]) { 1157 plog(LOG_ALERT, "Ooops block %#llx (0t%lld) was not locked\n", 1158 diskomizer_off2byteoff(off), diskomizer_off2byteoff(off)); 1159 TNF_PROBE_2(clear_write, "clear_write failed", 1160 "sunw%cte%diskomizer", 1161 tnf_opaque, off, off, tnf_opaque, map, map); 1162 } else { 1163 TNF_PROBE_2(clear_write, "clear_write ok", 1164 "sunw%cte%diskomizer", 1165 tnf_opaque, off, off, 1166 tnf_opaque, map, map); 1167 } 1168 mutex->exit(tmp); 1169 } 1170 1171 #ifdef NOT_USED_CODE 1172 static void 1173 print_bitmap(bitmap_t map[], int maplen) 1174 { 1175 int i; 1176 1177 for (i = 0; i < maplen; i++) 1178 pprintf("%#8.8X %#8.8X\n", i, map[i]); 1179 (void) fflush(stdout); 1180 } 1181 #endif 1182 1183 /* 1184 * find_next_free finds the "next" block that is not locked starting from 1185 * offset. 
1186 */ 1187 ullong_t 1188 find_next_free(bitmap_t map[], ullong_t off, int len, int maplen) 1189 { 1190 ulong_t tmp = (GET_OFF(off) % maplen); 1191 ulong_t i = 0; 1192 bitmap_t bit = GET_BIT(off); 1193 1194 mutex->enter(tmp); 1195 1196 while ((map[tmp] & bit) != 0) { 1197 ulong_t newtmp; 1198 1199 off = off + 1; 1200 1201 off %= len; 1202 newtmp = (GET_OFF(off) % maplen); 1203 bit = GET_BIT(off); 1204 mutex->getnext(tmp, newtmp); 1205 tmp = newtmp; 1206 TNF_PROBE_2(find_next_free_trying, "find_next_free trying", 1207 "sunw%cte%diskomizer", tnf_longlong, off, off, 1208 tnf_opaque, map, map); 1209 if (!(i < (4 * len))) { 1210 (void) plog(LOG_ALERT, 1211 "Unable to find free entry in map %#lx" 1212 " of length %d\n", 1213 (ulong_t)&map[0], maplen); 1214 (void) fflush(stderr); 1215 /* print_bitmap(map, maplen); */ 1216 mutex->exit(tmp); 1217 (void) sleep(1); 1218 mutex->enter(tmp); 1219 i = 0; 1220 } 1221 i++; 1222 } 1223 map[tmp] |= GET_BIT(off); 1224 mutex->exit(tmp); 1225 TNF_PROBE_2(find_next_free_found, "find_next_free found", 1226 "sunw%cte%diskomizer", tnf_longlong, off, off, 1227 tnf_opaque, map, map); 1228 return (off); 1229 } 1230 1231 /* 1232 * Test to see if the write bit is set for this offset. The lock MUST 1233 * already be held 1234 */ 1235 static int 1236 test_write(bitmap_t map[], ullong_t off, int maplen) 1237 { 1238 ulong_t tmp = GET_OFF(off) % maplen; 1239 1240 return (map[tmp] & GET_BIT(off) ? 1 : 0); 1241 } 1242 1243 /* 1244 * If this block is being read from or written to return true 1245 * Otherwise return lock it and return. 
1246 */ 1247 static int 1248 pend_write_with_lock(bitmap_t map[], ullong_t off, int maplen) 1249 { 1250 ulong_t tmp = GET_OFF(off) % maplen; 1251 int status; 1252 1253 mutex->enter(tmp); 1254 plog(LOG_DEBUG, "Disk Block %lld\n", diskomizer_off2byteoff(off)); 1255 if (map[tmp] & GET_BIT(off)) { 1256 status = 1; 1257 } else { 1258 map[tmp] |= GET_BIT(off); 1259 status = 0; 1260 } 1261 mutex->exit(tmp); 1262 return (status); 1263 } 1264 1265 static uchar_t 1266 choose_iolen(struct aio_str *aiop) 1267 { 1268 if (is_executable(aiop->buf)) { 1269 return (max_disk_io_len); 1270 } 1271 return (opts.disk_io_sizes.weightings[lrand48() % 1272 opts.disk_io_sizes.wlen]); 1273 } 1274 1275 static void 1276 init_read_buf(uchar_t *buf, ulong_t len, const uchar_t * const write_buf) 1277 { 1278 void *sig = expect_signal(SIGBUS, "memset", buf, len); 1279 1280 read_buffer_initializer(buf, len, write_buf); 1281 cancel_expected_signal(SIGBUS, sig); 1282 } 1283 1284 static bitmap_t * 1285 attach_dev_writemap(struct device *dev) 1286 { 1287 bitmap_t *map; 1288 int error_count = 0; 1289 1290 while ((map = (bitmap_t *) 1291 shm_ops->attach(dev->writemap_handle)) == NULL) { 1292 if ((error_count++ % 10000) == 0) 1293 ATTACH_ERROR(dev->writemap_handle); 1294 } 1295 if (error_count > 0) 1296 plog(LOG_WARNING, "attached o.k.\n"); 1297 return (map); 1298 } 1299 static bitmap_t * 1300 attach_aio_writemap(struct aio_str *aiop) 1301 { 1302 return (attach_dev_writemap(aiop->dev)); 1303 } 1304 static void 1305 clear_writemap(struct aio_str *aiop) 1306 { 1307 bitmap_t *map = attach_aio_writemap(aiop); 1308 clear_write(map, aiop->off, aiop->dev->writemap_size); 1309 shm_ops->detach(aiop->dev->writemap_handle); 1310 } 1311 static void 1312 clear_writemap_success(struct aio_str *aiop) 1313 { 1314 aiop->off = push_recent(aiop->dev->recent, aiop->off); 1315 if (aiop->off != -1) { 1316 clear_writemap(aiop); 1317 } 1318 } 1319 static struct blks * 1320 choose_new_random_read(struct aio_str *aiop, ullong_t 
start, ullong_t len) 1321 { 1322 struct blks *blocks; 1323 bitmap_t *map; 1324 1325 if ((aiop->off = pop_recent(aiop->dev->recent)) != -1) { 1326 return (aio_attach(aiop)); 1327 } 1328 1329 map = attach_aio_writemap(aiop); 1330 1331 aiop->retrycnt = 0; 1332 for (;;) { 1333 rand_block(map, aiop, start, len, 1334 aiop->dev->writemap_size); 1335 blocks = aio_attach(aiop); 1336 1337 if (is_readonly()) { 1338 if (0x1 & (uint_t) 1339 blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io) 1340 break; 1341 } else { 1342 if (blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io != NULL) 1343 break; 1344 } 1345 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 1346 blocks = NULL; 1347 clear_write(map, aiop->off, aiop->dev->writemap_size); 1348 } 1349 shm_ops->detach(aiop->dev->writemap_handle); 1350 return (blocks); 1351 } 1352 /* 1353 * do a new read. 1354 */ 1355 time_t 1356 do_new_read(struct aio_str *aiop, ullong_t start, read_type_t read_type) 1357 { 1358 ullong_t offset; 1359 int fd = aiop->fd->fd; 1360 ullong_t len; 1361 struct blks *blocks = NULL; 1362 1363 /* 1364 * if opts.sequential_passes is equal to seq_passes then we are on the 1365 * first pass or opts.sequential_passes was zero to start with. In 1366 * the second case once the disk is fill aip->dev->block will contain 1367 * the address of the last block anyway. 
1368 */ 1369 if (aiop->dev->seq_passes == opts.sequential_passes) { 1370 len = aiop->dev->block; 1371 } else { 1372 len = LEN_BYTES2BLOCKS(aiop->dev); 1373 } 1374 1375 1376 if (read_type != RETRY_READ && 1377 OPTION(nloops) != 0 && aiop->dev->countdown != 0) { 1378 if (--aiop->dev->countdown == 0) { 1379 time_now_log(LOG_INFO, "countdown on device %s is zero", 1380 aiop->dev->logicalname); 1381 } 1382 } 1383 if (read_type == NORMAL_READ) { 1384 struct shadow_hdr const *shadow; 1385 1386 blocks = choose_new_random_read(aiop, start, len); 1387 1388 if (aiop->buf == NULL) 1389 aiop->buf = get_read_buf(); 1390 1391 aiop->iolen = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_iolen; 1392 1393 aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen); 1394 aiop->daio_id.buf = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io; 1395 aiop->hdr = build_bufhr(aiop->dev, start, aiop->off); 1396 shadow = get_shadow_hdr(aiop->daio_id.buf); 1397 aiop->daio_id.chksum = shadow->chksums[aiop->iolen]; 1398 aiop->daio_id.buf_id = get_write_buf_id( 1399 blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io); 1400 aiop->daio_id.hdr_len = sizeof (aiop->hdr); 1401 aiop->daio_id.hdr = (uchar_t *)&aiop->hdr; 1402 } else if (read_type == RETRY_READ) { 1403 aiop->retrycnt++; 1404 } else if (read_type == WRITE_READ) { 1405 struct shadow_hdr const *shadow; 1406 1407 blocks = aio_attach(aiop); 1408 1409 aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen); 1410 aiop->daio_id.buf = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io; 1411 aiop->daio_id.buf_id = get_write_buf_id( 1412 blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io); 1413 shadow = get_shadow_hdr(aiop->daio_id.buf); 1414 aiop->daio_id.chksum = shadow->chksums[aiop->iolen]; 1415 aiop->daio_id.hdr_len = sizeof (aiop->hdr); 1416 aiop->daio_id.hdr = (uchar_t *)&aiop->hdr; 1417 } else if (read_type == READ_ONLY_RAND) { 1418 if (aiop->buf == NULL) 1419 aiop->buf = get_read_buf(); 1420 1421 blocks = choose_new_random_read(aiop, start, len); 1422 aiop->daio_id.buf = 1423 
BIT2CHARSTAR(blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io); 1424 aiop->daio_id.chksum = 1425 blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_chksum; 1426 aiop->iolen = max_disk_io_len; 1427 aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen); 1428 } else if (read_type == READ_ONLY_SEQ) { 1429 bitmap_t *map; 1430 1431 if (aiop->buf == NULL) 1432 aiop->buf = get_read_buf(); 1433 1434 map = attach_aio_writemap(aiop); 1435 len = LEN_BYTES2BLOCKS(aiop->dev); 1436 aiop->dev->choose_block(map, aiop, start, len, 1437 aiop->dev->writemap_size); 1438 shm_ops->detach(aiop->dev->writemap_handle); 1439 blocks = aio_attach(aiop); 1440 1441 aiop->daio_id.buf = 1442 BIT2CHARSTAR(blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io); 1443 aiop->daio_id.chksum = 1444 blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_chksum; 1445 aiop->iolen = max_disk_io_len; 1446 aiop->daio_id.bufs = INDEX_TO_DIOLEN(aiop->iolen); 1447 } 1448 1449 offset = aio_str2byteoff(aiop); 1450 1451 while (my_gettimeofday(&aiop->tv, NULL) == -1) 1452 pperror("gettimeofday"); 1453 1454 if (blocks == NULL) 1455 blocks = aio_attach(aiop); 1456 1457 init_read_buf(aiop->buf, INDEX_TO_DIOLEN(max_disk_io_len), 1458 blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io); 1459 1460 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 1461 1462 for (;;) { 1463 if (aiop->dev->stop_flag || 1464 stop_check(aiop->dev->shared_data_handle)) { 1465 if (aiop->dev->stop_flag == 0) { 1466 plog(LOG_NOTICE, "Stopping %s\n", 1467 aiop->dev->logicalname); 1468 aiop->dev->stop_flag = 1; 1469 } else if (aiop->dev->need_to_stop && 1470 set_shared_stop_flag( 1471 aiop->dev->shared_data_handle) != -1) { 1472 aiop->dev->need_to_stop = 0; 1473 } 1474 clear_writemap(aiop); 1475 return_read_buf(aiop->buf); 1476 aiop->buf = NULL; 1477 remove_from_aio_list(&aiop->fd->all_aios, aiop); 1478 break; 1479 } 1480 1481 ZERO_OBJ(aiop->error.desc); 1482 1483 /* Move to the begining of the all_aios list */ 1484 remove_from_aio_list(&aiop->fd->all_aios, aiop); 1485 add_to_aio_list(&aiop->fd->all_aios, aiop); 1486 
1487 TNF_PROBE_4(aioread, "aioread", 1488 "sunw%cte%diskomizer%aio read", 1489 tnf_long, fd, aiop->fd->fd, 1490 tnf_opaque, offset, offset, 1491 tnf_opaque, aiop, aiop, 1492 aio_tnf_str, *aiop, aiop); 1493 1494 if (daio->aread(fd, aiop->buf, 1495 INDEX_TO_DIOLEN(aiop->iolen), offset, 1496 &aiop->aio_res, &aiop->daio_id) < 0) { 1497 if (errno == EAGAIN) { 1498 AIOREAD_ERROR(fd, aiop->fd->name, 1499 aiop->buf, 1500 INDEX_TO_DIOLEN(aiop->iolen), offset, 1501 SEEK_SET, &aiop->aio_res); 1502 continue; 1503 } else { 1504 AIOREAD_ERROR(fd, aiop->fd->name, aiop->buf, 1505 INDEX_TO_DIOLEN(aiop->iolen), offset, 1506 SEEK_SET, &aiop->aio_res); 1507 clear_writemap(aiop); 1508 } 1509 } 1510 aiop->fd->total_read++; 1511 break; 1512 } 1513 return (aiop->tv.tv_sec); 1514 } 1515 static struct shadow_hdr const * 1516 set_io_len(struct aio_str *aiop) 1517 { 1518 struct shadow_hdr const *shadow_hdr = get_shadow_hdr(aiop->buf); 1519 if (!shadow_hdr->type.BUF_READY) { 1520 struct shadow_hdr *shadow; 1521 int j; 1522 shadow = (struct shadow_hdr *)shadow_hdr; 1523 init_buf(aiop->buf); 1524 for (j = 0; j <= opts.disk_io_sizes.weightings[ 1525 opts.disk_io_sizes.wlen - 1]; j++) { 1526 shadow->chksums[j] = 1527 check_bufbody(aiop->buf, 1528 INDEX_TO_DIOLEN(j)); 1529 } 1530 shadow->type = get_bufhdr_a(aiop->buf).type; 1531 shadow->type.BUF_READY = 1; 1532 if (opts.obscure_execute && is_executable(aiop->buf)) { 1533 run_func(aiop->buf, 1534 opts.disk_io_sizes.vals[aiop->iolen] - 1535 SIZEOF_BUFHDR); 1536 } 1537 aiop->iolen = choose_iolen(aiop); 1538 } else if (shadow_hdr->type.BUF_READ_ONLY) { 1539 aiop->iolen = max_disk_io_len; 1540 } else { 1541 assert(*aiop->buf == 0xAA || *aiop->buf == 0x55); 1542 aiop->iolen = choose_iolen(aiop); 1543 } 1544 return (shadow_hdr); 1545 } 1546 static int 1547 is_sequential(struct aio_str *aiop) 1548 { 1549 return (aiop->dev->choose_block == seq_block || 1550 aiop->dev->choose_block == unwritten_block_seq); 1551 } 1552 1553 static struct aio_str * 1554 
get_deferred_io(struct device *dev) 1555 { 1556 struct device *devp; 1557 struct aio_str *aiop; 1558 1559 for (devp = dev; devp != NULL; devp = devp->next) { 1560 if ((aiop = pop_from_aio_list(&devp->deferred_ios)) != NULL) { 1561 return (aiop); 1562 } 1563 } 1564 return (NULL); 1565 } 1566 1567 static void 1568 deferred_starter(struct device *dev, ullong_t start) 1569 { 1570 int all_going = 1; 1571 struct device *devp; 1572 1573 for (devp = dev; devp != NULL; devp = devp->next) { 1574 struct aio_str *aiop; 1575 1576 check_exit_flag(); 1577 1578 aiop = pop_from_aio_list(&devp->deferred_ios); 1579 if (aiop != NULL) { 1580 cancelled_count--; 1581 aiop->handler(aiop, start); 1582 } 1583 if (is_aio_on_list(&devp->deferred_ios)) { 1584 all_going = 0; 1585 } 1586 } 1587 if (all_going == 1) { 1588 start_deferred = (void (*)(struct device *, ullong_t)) nop; 1589 } 1590 } 1591 1592 static int 1593 number_of_writes(struct device *dev) 1594 { 1595 struct fds *fd = dev->fdhead; 1596 int count = 0; 1597 1598 do { 1599 count += fd->total_write; 1600 fd = fd->next; 1601 } while (fd != dev->fdhead); 1602 1603 return (count); 1604 } 1605 1606 /* static void */ 1607 void 1608 do_new_write(struct aio_str *aiop, ullong_t start, int retry) 1609 { 1610 ullong_t len; 1611 ullong_t offset; 1612 long writemap_size; 1613 struct shadow_hdr const *shadow_hdr; 1614 struct blks *blocks; 1615 struct blks *block; 1616 char deferred; 1617 1618 if (aiop->fd == NULL) { 1619 aiop->fd = aiop->dev->fdhead; 1620 add_to_aio_list(&aiop->fd->all_aios, aiop); 1621 deferred = 1; 1622 } else { 1623 deferred = 0; 1624 } 1625 1626 len = LEN_BYTES2BLOCKS(aiop->dev); 1627 writemap_size = aiop->dev->writemap_size; 1628 1629 if (!retry) { 1630 bitmap_t *map; 1631 int i; 1632 if (write_loops) { 1633 if (--aiop->dev->countdown == 0) { 1634 time_now_log(LOG_INFO, 1635 "countdown on device %s is zero", 1636 aiop->dev->logicalname); 1637 } 1638 } 1639 if (aiop->buf == NULL) { 1640 aiop->buf = get_write_buf(); 1641 } 
1642 shadow_hdr = set_io_len(aiop); 1643 map = attach_aio_writemap(aiop); 1644 1645 for (i = 0; /* cstyle */; i++) { 1646 aiop->dev->choose_block(map, aiop, start, len, 1647 writemap_size); 1648 blocks = aio_attach(aiop); 1649 1650 block = &blocks[AIO_BLOCK_INDEX(aiop)]; 1651 if (block->r.w.last_io == NULL || 1652 (block->read_count >= OPTION(read_minimum) && 1653 block->r.w.last_io != aiop->buf)) { 1654 break; 1655 } else { 1656 if (block->r.w.last_io == aiop->buf) { 1657 uchar_t *buf; 1658 /* 1659 * Get the new write buf first so that 1660 * You definitely get a new buffer. 1661 */ 1662 if ((buf = get_write_buf()) != NULL) { 1663 return_write_buf(aiop->buf); 1664 aiop->buf = buf; 1665 shadow_hdr = set_io_len(aiop); 1666 break; 1667 } 1668 } 1669 clear_write(map, aiop->off, writemap_size); 1670 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 1671 block = blocks = NULL; 1672 if (i * OPTION(obscure_search_multiplier) >= 1673 len || deferred) { 1674 if (!deferred) { 1675 union err_info err_info; 1676 err_info.str = "write"; 1677 report_error(aiop, err_info, 1678 ERR_DEFERRED); 1679 } 1680 remove_from_aio_list( 1681 &aiop->fd->all_aios, aiop); 1682 aiop->fd = NULL; 1683 add_to_aio_list( 1684 &aiop->dev->deferred_ios, aiop); 1685 return_write_buf(aiop->buf); 1686 aiop->buf = NULL; 1687 shm_ops->detach( 1688 aiop->dev->writemap_handle); 1689 if (number_of_writes(aiop->dev) == 0) { 1690 start_deferred = 1691 deferred_starter; 1692 } 1693 return; 1694 } 1695 } 1696 } 1697 aiop->retrycnt = 0; 1698 shm_ops->detach(aiop->dev->writemap_handle); 1699 } else { 1700 /* if we are retrying then we already have the lock. 
*/ 1701 ullong_t blockno = aio_str2byteoff(aiop); 1702 shadow_hdr = get_shadow_hdr(aiop->buf); 1703 if (retry == 1) { 1704 pfprintf(stderr, 1705 "%s Block 0t%lld (%#llx) retry count %d\n", 1706 aiop->fd->name, blockno, blockno, 1707 ++aiop->retrycnt); 1708 } 1709 blocks = aio_attach(aiop); 1710 1711 block = &blocks[AIO_BLOCK_INDEX(aiop)]; 1712 } 1713 1714 offset = (ullong_t)start + (INDEX_TO_DIOLEN(max_disk_io_len)*aiop->off); 1715 1716 assert((ullong_t)offset >= (ullong_t)start); 1717 assert((ullong_t)offset <= (ullong_t)(start + aiop->dev->length - 1718 INDEX_TO_DIOLEN(max_disk_io_len))); 1719 1720 while (my_gettimeofday(&aiop->tv, NULL) == -1) 1721 pperror("gettimeofday"); 1722 1723 if (shadow_hdr->type.BUF_READ_ONLY == 0) { 1724 /* 1725 * Set up the buffer header and store away the path_id of the 1726 * path we are using, and the header checksum. 1727 */ 1728 unprotect_buf(aiop->buf); 1729 toggle_bufhdr(aiop->buf); 1730 set_bufhdr_all(aiop->buf, shadow_hdr->chksums[aiop->iolen], 1731 INDEX_TO_DIOLEN(aiop->iolen), 1732 aiop->fd->devid, offset, shadow_hdr->type, 1733 ++block->sequence, 1734 aiop->tv.tv_sec); 1735 block->path_id = aiop->fd->path_id; 1736 block->hdrchksum = 1737 set_buf_hdrchksum(aiop->buf); 1738 protect_buf(aiop->buf); 1739 } else { 1740 block->hdrchksum = check_bufhdr(aiop->buf, 1741 get_bufhdr_hdrchksum(aiop->buf)); 1742 } 1743 if (get_bufhdr_hdrchksum(aiop->buf) != block->hdrchksum) { 1744 pfprintf(stderr, "writing bad checksum buf %#lx\n", 1745 (ulong_t)aiop->buf); 1746 } 1747 if (opts.obscure_execute && is_executable(aiop->buf)) { 1748 plog(LOG_DEBUG, "Writing executable buffer\n"); 1749 } 1750 /* 1751 * This if is saying that this process should have initialized 1752 * this block, during the sequential part of the run. So last_io 1753 * should be set. If not then something went wrong. 1754 * 1755 * The block could also been skipped as it was busy and put on 1756 * the unwritten list, so only check if the unwritten list is 1757 * empty. 
1758 */ 1759 1760 if (!is_sequential(aiop) && block->r.w.last_io == NULL && 1761 (aiop->off % opts.nprocs) == this_proc() && 1762 aiop->dev->failed_to_push_unwritten == 0 && 1763 blocks[AIO_BLOCK_INDEX(aiop)].u.was_unwritten == 1 && 1764 !is_unwritten(aiop)) { 1765 ullong_t blockno = aio_str2byteoff(aiop); 1766 pfprintf(stderr, "Device %s\n", aiop->fd->name); 1767 pfprintf(stderr, "Device len %#llx\n", aiop->dev->length); 1768 pfprintf(stderr, "This proc %d nprocs %ld\n", this_proc(), 1769 opts.nprocs); 1770 pfprintf(stderr, "Block %#llx (0t%lld) byte off %llx error\n", 1771 aiop->off, aiop->off, blockno); 1772 pfprintf(stderr, "Last Requested %ld\n", 1773 block->last_requested); 1774 pfprintf(stderr, "Last return delta %d\n", 1775 block->last_returned_delta); 1776 pfprintf(stderr, "Read Count %d\n", block->read_count); 1777 pfprintf(stderr, "Last Io Len %d\n", 1778 INDEX_TO_DIOLEN(block->r.w.last_iolen)); 1779 assert(block->r.w.last_io != NULL); 1780 } 1781 1782 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 1783 1784 for (;;) { 1785 if (aiop->dev->stop_flag || 1786 stop_check(aiop->dev->shared_data_handle)) { 1787 if (aiop->dev->stop_flag == 0) { 1788 plog(LOG_NOTICE, "Stopping %s\n", 1789 aiop->dev->logicalname); 1790 aiop->dev->stop_flag = 1; 1791 } else if (aiop->dev->need_to_stop && 1792 set_shared_stop_flag( 1793 aiop->dev->shared_data_handle) != -1) { 1794 aiop->dev->need_to_stop = 0; 1795 } 1796 clear_writemap(aiop); 1797 return_write_buf(aiop->buf); 1798 aiop->buf = NULL; 1799 remove_from_aio_list(&aiop->fd->all_aios, aiop); 1800 break; 1801 } 1802 1803 assert(*aiop->buf == 0xAA || *aiop->buf == 0x55); 1804 1805 ZERO_OBJ(aiop->error.desc); 1806 1807 TNF_PROBE_4(daiowrite, "aiowrite", 1808 "sunw%cte%diskomizer%aio write", 1809 tnf_long, fd, aiop->fd->fd, 1810 tnf_opaque, offset, offset, 1811 tnf_opaque, aiop, aiop, 1812 aio_tnf_str, *aiop, aiop); 1813 aiop->daio_id.buf = aiop->buf; 1814 aiop->daio_id.buf_id = get_write_buf_id(aiop->buf); 1815 
aiop->daio_id.hdr_len = sizeof (aiop->hdr); 1816 aiop->daio_id.hdr = (uchar_t *)&aiop->hdr; 1817 (void) memcpy(&aiop->hdr, aiop->buf, sizeof (aiop->hdr)); 1818 1819 aiop->daio_id.footer_len = 0; 1820 1821 /* Move to the begining of the all_aios list */ 1822 remove_from_aio_list(&aiop->fd->all_aios, aiop); 1823 add_to_aio_list(&aiop->fd->all_aios, aiop); 1824 1825 if (daio->awrite(aiop->fd->fd, aiop->buf, 1826 INDEX_TO_DIOLEN(aiop->iolen), 1827 offset, &aiop->aio_res, &aiop->daio_id) == -1) { 1828 int serrno = errno; 1829 AIOWRITE_ERROR(aiop->fd->fd, aiop->fd->name, 1830 (ulong_t)aiop->buf, 1831 INDEX_TO_DIOLEN(aiop->iolen), 1832 offset, 1833 SEEK_SET, 1834 (ulong_t)&aiop->aio_res); 1835 if (serrno == EAGAIN) { 1836 continue; 1837 } else { 1838 clear_writemap(aiop); 1839 } 1840 } else if (!deferred) { 1841 aiop->fd->total_write++; 1842 if (is_aio_on_list(&aiop->dev->deferred_ios)) { 1843 aiop = pop_from_aio_list( 1844 &aiop->dev->deferred_ios); 1845 do_new_write(aiop, start, 0); 1846 } 1847 } else { 1848 aiop->fd->total_write++; 1849 plog(LOG_NOTICE, "Started deferred io to %s\n", 1850 aiop->dev->logicalname); 1851 } 1852 break; 1853 } 1854 } 1855 1856 /*ARGSUSED1*/ 1857 void 1858 run_func(uchar_t *buf, size_t size) 1859 { 1860 uchar_t *cptr; 1861 #ifdef SPARC 1862 uint32_t *last, *ptr; 1863 #else 1864 uint32_t *ptr; 1865 #endif 1866 void (*func)(void); 1867 1868 cptr = get_buf_data(buf); 1869 #ifdef SPARC 1870 /* check alignment for SPARC */ 1871 if ((ulong_t)cptr % 4) { 1872 return; 1873 } 1874 #endif 1875 /*LINTED*/ 1876 ptr = (uint32_t *)cptr; 1877 func = (void (*)(void))(ptr); 1878 #ifdef SPARC 1879 last = ptr + size / sizeof (uint32_t); 1880 1881 for (; ptr < last; ptr++) 1882 flush((int32_t *)ptr); 1883 #endif 1884 1885 plog(LOG_DEBUG, "Running func %#lx in buf %#lx, type %llx\n", 1886 func, (ulong_t)buf, get_bufhdr(buf).start); 1887 TNF_PROBE_1(run_func, "run_func", 1888 "sunw%cte%diskomizer%aio execute run", 1889 tnf_opaque, buf, buf); 1890 func(); 1891 } 
1892 1893 struct fds * 1894 find_path(struct fds *fdhead, char path_id) 1895 { 1896 struct fds *fd; 1897 1898 for (fd = fdhead->next; ; fd = fd->next) { 1899 if (fd->path_id == path_id) 1900 return (fd); 1901 if (fd == fdhead) 1902 return (NULL); 1903 } 1904 } 1905 1906 static struct bufhdr 1907 build_bufhr(struct device *dev, ullong_t start, ullong_t off) 1908 { 1909 struct bufhdr hdr; 1910 struct shadow_hdr const *shadow_hdr; 1911 struct blks *block; 1912 struct blks *blocks; 1913 struct fds *fd; 1914 ushort16_t hdrchksum; 1915 int error_count = 0; 1916 ullong_t offset = (ullong_t)start + 1917 (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*off); 1918 while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) { 1919 if (error_count++ % 10000 == 0) 1920 ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off)); 1921 } 1922 if (error_count > 0) 1923 plog(LOG_WARNING, "attached o.k.\n"); 1924 block = &blocks[DEV_BLOCK_INDEX(dev, off)]; 1925 ZERO_OBJ(hdr); 1926 fd = find_path(dev->fdhead, block->path_id); 1927 assert(fd != NULL); 1928 1929 if (block->bad_hdr) { 1930 (void) memcpy(&hdr, block->r.w.last_io, SIZEOF_BUFHDR); 1931 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 1932 return (hdr); 1933 } 1934 shadow_hdr = get_shadow_hdr(block->r.w.last_io); 1935 1936 if (block->ab == 1) { 1937 hdr.start = hdr.end = BUF_TYPE_A; 1938 hdr.ab.a.chksum = shadow_hdr->chksums[block->r.w.last_iolen]; 1939 hdr.ab.a.type = shadow_hdr->type; 1940 hdr.ab.a.type.sequence = block->sequence; 1941 hdr.ab.a.devid = fd->devid; 1942 hdr.ab.a.off = offset; 1943 hdr.ab.a.time = block->last_requested; 1944 hdr.ab.a.did = master_pid(); 1945 hdr.ab.a.len = INDEX_TO_DIOLEN(block->r.w.last_iolen); 1946 get_serial_and_provider(hdr.ab.a.serial_and_provider, 1947 SIZEOF_SERIAL_AND_PROVIDER); 1948 } else { 1949 hdr.start = hdr.end = BUF_TYPE_B; 1950 hdr.ab.b.time = block->last_requested; 1951 hdr.ab.b.chksum = shadow_hdr->chksums[block->r.w.last_iolen]; 1952 hdr.ab.b.type = shadow_hdr->type; 1953 
hdr.ab.b.type.sequence = block->sequence; 1954 hdr.ab.b.devid = fd->devid; 1955 hdr.ab.b.off = offset; 1956 hdr.ab.b.did = master_pid(); 1957 hdr.ab.b.len = INDEX_TO_DIOLEN(block->r.w.last_iolen); 1958 get_serial_and_provider(hdr.ab.b.serial_and_provider, 1959 SIZEOF_SERIAL_AND_PROVIDER); 1960 } 1961 if ((hdrchksum = set_hdrchksum(&hdr)) != block->hdrchksum) { 1962 pfprintf(stderr, 1963 "Bad rebuilt buf header is %#x should be %#x\n", 1964 block->hdrchksum, hdrchksum); 1965 } 1966 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 1967 return (hdr); 1968 } 1969 struct bufhdr 1970 build_prevbufhr(struct device *dev, ullong_t start, ullong_t off) 1971 { 1972 struct bufhdr hdr; 1973 struct shadow_hdr const *shadow_hdr; 1974 struct blks *block; 1975 struct blks *blocks; 1976 struct fds *fd; 1977 ushort16_t hdrchksum; 1978 int error_count = 0; 1979 ullong_t offset = (ullong_t)start + 1980 (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*off); 1981 while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) { 1982 if (error_count++ % 10000 == 0) 1983 ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off)); 1984 } 1985 if (error_count > 0) 1986 plog(LOG_WARNING, "attached o.k.\n"); 1987 block = &blocks[DEV_BLOCK_INDEX(dev, off)]; 1988 ZERO_OBJ(hdr); 1989 fd = find_path(dev->fdhead, block->path_id); 1990 assert(fd != NULL); 1991 1992 if (block->bad_hdr) { 1993 (void) memcpy(&hdr, block->r.w.last_io, SIZEOF_BUFHDR); 1994 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 1995 return (hdr); 1996 } 1997 shadow_hdr = get_shadow_hdr(block->r.w.prev_io); 1998 1999 if (block->ab != 1) { 2000 hdr.start = hdr.end = BUF_TYPE_A; 2001 hdr.ab.a.time = block->u.prev_requested; 2002 hdr.ab.a.chksum = shadow_hdr->chksums[block->r.w.prev_iolen]; 2003 hdr.ab.a.type = shadow_hdr->type; 2004 hdr.ab.a.type.sequence = block->sequence - 1; 2005 hdr.ab.a.devid = fd->devid; 2006 hdr.ab.a.off = offset; 2007 get_serial_and_provider(hdr.ab.a.serial_and_provider, 2008 SIZEOF_SERIAL_AND_PROVIDER); 2009 hdr.ab.a.len = 
INDEX_TO_DIOLEN(block->r.w.prev_iolen); 2010 hdr.ab.a.did = master_pid(); 2011 } else { 2012 hdr.start = hdr.end = BUF_TYPE_B; 2013 hdr.ab.b.chksum = shadow_hdr->chksums[block->r.w.prev_iolen]; 2014 hdr.ab.b.type = shadow_hdr->type; 2015 hdr.ab.b.type.sequence = block->sequence - 1; 2016 hdr.ab.b.devid = fd->devid; 2017 hdr.ab.b.off = offset; 2018 hdr.ab.b.len = INDEX_TO_DIOLEN(block->r.w.prev_iolen); 2019 hdr.ab.b.time = block->u.prev_requested; 2020 hdr.ab.b.did = master_pid(); 2021 get_serial_and_provider(hdr.ab.b.serial_and_provider, 2022 SIZEOF_SERIAL_AND_PROVIDER); 2023 } 2024 if ((hdrchksum = set_hdrchksum(&hdr)) != block->hdrchksum) { 2025 pfprintf(stderr, 2026 "Bad rebuilt buf header is %#x should be %#x\n", 2027 block->hdrchksum, hdrchksum); 2028 } 2029 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 2030 return (hdr); 2031 } 2032 static struct diff_return 2033 memdiff_data(FILE *err, uchar_t *goodptr, uchar_t *badptr, 2034 int offset, int len) 2035 { 2036 int i; 2037 struct diff_return dr; 2038 union { 2039 uchar_t c[sizeof (uint64_t) / sizeof (uchar_t)]; 2040 uint32_t i[sizeof (uint64_t) / sizeof (uint32_t)]; 2041 uint64_t l; 2042 } good, bad, diff; 2043 2044 dr.bits = 0LL; 2045 dr.count = 0LL; 2046 2047 for (i = 0; i < len; i += sizeof (uint64_t)) { 2048 (void) memcpy(&good.c[0], goodptr, sizeof (uint64_t)); 2049 (void) memcpy(&bad.c[0], badptr, sizeof (uint64_t)); 2050 diff.i[0] = good.i[0] ^ bad.i[0]; 2051 diff.i[1] = good.i[1] ^ bad.i[1]; 2052 2053 if (!opts.expert_small_diffs || diff.l) { 2054 int bc = count_uint32_bits(diff.i[0]) + 2055 count_uint32_bits(diff.i[1]); 2056 dr.count += bc; 2057 dr.bits |= diff.l; 2058 #ifdef _BIG_ENDIAN 2059 (void) fprintf(err, 2060 "0x%8.8x %8.8x%8.8x %8.8x%8.8x " 2061 "%8.8x%8.8x %2.2d\n", i + offset, 2062 good.i[0], good.i[1], bad.i[0], bad.i[1], 2063 diff.i[0], diff.i[1], bc); 2064 #elif defined(_LITTLE_ENDIAN) 2065 (void) fprintf(err, 2066 "0x%8.8x " 2067 "%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x " 2068 
"%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x " 2069 "%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x " 2070 "%2.2d\n", i + offset, 2071 good.c[0], good.c[1], good.c[2], good.c[3], 2072 good.c[4], good.c[5], good.c[6], good.c[7], 2073 bad.c[0], bad.c[1], bad.c[2], bad.c[3], 2074 bad.c[4], bad.c[5], bad.c[6], bad.c[7], 2075 diff.c[0], diff.c[1], diff.c[2], diff.c[3], 2076 diff.c[4], diff.c[5], diff.c[6], diff.c[7], 2077 bc); 2078 #else 2079 #error "niether _BIG_ENDIAN or _LITTLE_ENDIAN defined" 2080 #endif 2081 } 2082 badptr += sizeof (uint64_t); 2083 goodptr += sizeof (uint64_t); 2084 } 2085 return (dr); 2086 } 2087 2088 static struct diff_return 2089 memdiff_bufhdr(FILE *err, uchar_t *buf, uchar_t *good_hdr) 2090 { 2091 union { 2092 struct bufhdr hdr; 2093 uchar_t c[SIZEOF_BUFHDR]; 2094 } bad; 2095 2096 bad.hdr = get_bufhdr(buf); 2097 2098 return (memdiff_data(err, good_hdr, &bad.c[0], 0, SIZEOF_BUFHDR)); 2099 } 2100 2101 char * 2102 diff_file(void) 2103 { 2104 char *wd; 2105 static char *diffs_file; 2106 2107 if (NULL == diffs_file) { 2108 if (diffs[0] != '/' && (wd = getcwd(NULL, 128)) != NULL) { 2109 int x = strlen(diffs) + strlen(wd) + 2; 2110 if ((diffs_file = malloc(x)) != NULL) { 2111 snprintf(diffs_file, x, "%s/%s", wd, diffs); 2112 } else { 2113 diffs_file = diffs; 2114 } 2115 free(wd); 2116 } else { 2117 diffs_file = diffs; 2118 } 2119 } 2120 return (diffs_file); 2121 } 2122 2123 struct diff_return 2124 memdiff_buf(uint64_t off, struct device *dev, uchar_t *buf, uint32_t iolen, 2125 struct fds *fd, const char *str, struct error *error) 2126 { 2127 static const char zero2seven[] = "0 1 2 3 4 5 6 7"; 2128 uchar_t *badptr; 2129 uchar_t *goodptr; 2130 uchar_t *prevptr; /* pointer to the previous buffer that was written */ 2131 FILE *err; 2132 time_t now; 2133 sigset_t nset; /* new set */ 2134 sigset_t oset; /* old set */ 2135 int sigprocmask_status; 2136 int error_count = 0; 2137 struct diff_return dr; 2138 struct diff_return dr2; 2139 struct blks *blocks, *block; 2140 
union { 2141 struct bufhdr hdr; 2142 uchar_t c[SIZEOF_BUFHDR]; 2143 } good, prev; 2144 while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) { 2145 if (error_count++ % 10000 == 0) 2146 ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off)); 2147 } 2148 if (error_count > 0) 2149 plog(LOG_WARNING, "attached o.k.\n"); 2150 block = &blocks[DEV_BLOCK_INDEX(dev, off)]; 2151 if (block->r.w.last_io == NULL) { 2152 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 2153 dr.count = -1; 2154 return (dr); 2155 } 2156 goodptr = get_buf_data(block->r.w.last_io); 2157 if (block->r.w.prev_io != NULL) { 2158 prevptr = get_buf_data(block->r.w.prev_io); 2159 } else { 2160 prevptr = NULL; 2161 } 2162 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 2163 2164 badptr = get_buf_data(buf); 2165 2166 if ((err = fopen(diff_file(), "a+")) == NULL) { 2167 err = stderr; 2168 FOPEN_ERROR(diff_file(), "a+"); 2169 2170 (void) sigemptyset(&nset); 2171 (void) sigaddset(&nset, SIGINT); 2172 (void) sigaddset(&nset, SIGTERM); 2173 sigprocmask_status = 2174 sigprocmask(SIG_BLOCK, &nset, &oset); 2175 mutex->stderr_enter(); 2176 } 2177 2178 now = time(NULL); 2179 (void) fprintf(err, "diskomizer %s\n", VERSION); 2180 print_bufhdr_offsets(err); 2181 2182 (void) fprintf(err, "Error Instance %d\n", get_error_instance_number()); 2183 (void) fprintf(err, "Diffs dumped %s", ctime(&now)); 2184 (void) fprintf(err, "Diffs from %s for block 0x%llx\n", 2185 str, diskomizer_off2byteoff(off)); 2186 (void) fprintf(err, 2187 "use \"" 2188 "dd if=%s bs=%d iseek=%lld count=1\" to read the block\n", 2189 fd->longname, iolen, 2190 (opts.start_offset) + off); 2191 good.hdr = build_bufhr(dev, start_offset(), off); 2192 decode_header(err, &good.c[0], buf); 2193 (void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s %s\n", 2194 "", "Written", "Read", "Diffs", "Bit count"); 2195 (void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s\n", 2196 "Offset", zero2seven, zero2seven, zero2seven); 2197 2198 dr = memdiff_bufhdr(err, buf, &good.c[0]); 
2199 dr2 = memdiff_data(err, goodptr, badptr, SIZEOF_BUFHDR, 2200 iolen - SIZEOF_BUFHDR); 2201 2202 dr.count += dr2.count; 2203 dr.bits |= dr2.bits; 2204 2205 (void) fprintf(err, "End of diffs for block 0x%llx\n", 2206 diskomizer_off2byteoff(off)); 2207 if (prevptr != NULL && opts.display_prev_diffs) { 2208 prev.hdr = build_prevbufhr(dev, start_offset(), off); 2209 (void) fprintf(err, "Diffs from %s for previous io to block " 2210 "0x%llx\n", str, diskomizer_off2byteoff(off)); 2211 (void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s %s\n", 2212 "", "Written", "Read", "Diffs", "Bit count"); 2213 (void) fprintf(err, "%10.10s %16.16s %16.16s %16.16s\n", 2214 "Offset", zero2seven, zero2seven, zero2seven); 2215 (void) memdiff_bufhdr(err, buf, &prev.c[0]); 2216 (void) memdiff_data(err, prevptr, badptr, SIZEOF_BUFHDR, 2217 iolen - SIZEOF_BUFHDR); 2218 } 2219 2220 2221 (void) fflush(err); 2222 if (fsync(fileno(err)) == -1) { 2223 FSYNC_ERROR(fileno(err), diffs); 2224 } 2225 if (err != stderr) { 2226 (void) fclose(err); 2227 error->diff_file = diff_file(); 2228 dlog(LOG_ERR, "Diffs file dumped to %s\n", diff_file()); 2229 } else { 2230 mutex->stderr_exit(); 2231 error->diff_file = NULL; 2232 if (sigprocmask_status == 0) 2233 (void) sigprocmask(SIG_SETMASK, &oset, NULL); 2234 } 2235 return (dr); 2236 } 2237 struct diff_return 2238 memdiff(struct aio_str *aiop, char *str) 2239 { 2240 struct diff_return dr; 2241 2242 if (!is_readonly()) { 2243 aiop->error.dr = memdiff_buf(aiop->off, aiop->dev, aiop->buf, 2244 INDEX_TO_DIOLEN(aiop->iolen), aiop->fd, str, &aiop->error); 2245 return (aiop->error.dr); 2246 } 2247 dr.count = dr.bits = 0; 2248 return (dr); 2249 } 2250 /* 2251 * Check to see if the buffer that has been read matches the previous 2252 * buffer that was written. This would spot if an write never got to 2253 * the disk. 
2254 */ 2255 int 2256 check_previous_buffer(check_t check_sum, struct aio_str *aiop) 2257 { 2258 uchar_t *previous_buf_written; 2259 struct blks *blocks; 2260 time_t prev_time; 2261 2262 blocks = aio_attach(aiop); 2263 2264 previous_buf_written = blocks[AIO_BLOCK_INDEX(aiop)].r.w.prev_io; 2265 prev_time = blocks[AIO_BLOCK_INDEX(aiop)].u.prev_requested; 2266 2267 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 2268 2269 if (previous_buf_written != NULL) { 2270 struct bufhdr_a hdr_a; 2271 struct bufhdr hdr; 2272 hdr = get_bufhdr(previous_buf_written); 2273 2274 hdr_a = conv_bufhdr(&hdr); 2275 2276 if (check_sum == hdr_a.chksum) { 2277 char *time_str; 2278 2279 time_str = alloc_time_str(prev_time); 2280 2281 pfprintf(stderr, "block %llx checksum matches" 2282 " the previous block written at %s\n", 2283 aio_str2byteoff(aiop), NOT_NULL(time_str)); 2284 not_null_free(time_str); 2285 return (1); 2286 } 2287 } 2288 return (0); 2289 } 2290 2291 static int 2292 check_old_data(struct aio_str *aiop) 2293 { 2294 const char *x; 2295 time_t tyme; 2296 pid_t did; 2297 2298 if (((x = get_buf_serial_and_provider(aiop->buf)) == NULL) || 2299 cmp_serial_and_provider(x) != 0) { 2300 char y[SIZEOF_SERIAL_AND_PROVIDER]; 2301 2302 get_serial_and_provider(y, SIZEOF_SERIAL_AND_PROVIDER); 2303 2304 if (x == NULL) { 2305 plog(LOG_NOTICE, "block %llx contains data that " 2306 "could not be recognized.\n", 2307 aio_str2byteoff(aiop)); 2308 } else { 2309 plog(LOG_NOTICE, "block %llx contains data written " 2310 "by host %.*s not %.*s\n", aio_str2byteoff(aiop), 2311 SIZEOF_SERIAL_AND_PROVIDER, x, 2312 SIZEOF_SERIAL_AND_PROVIDER, y); 2313 } 2314 return (1); 2315 } else if ((tyme = get_buf_time(aiop->buf)) < start_time.tv_sec) { 2316 char *t = alloc_time_str(tyme); 2317 2318 plog(LOG_NOTICE, "block %llx contains data written before " 2319 "this instance started. 
It was written at %s\n", 2320 aio_str2byteoff(aiop), NOT_NULL(t)); 2321 not_null_free(t); 2322 return (1); 2323 } else if ((did = get_buf_did(aiop->buf)) != master_pid()) { 2324 plog(LOG_NOTICE, "block at byte offset %llx not written by " 2325 "this instance, but by %ld\n", aio_str2byteoff(aiop), did); 2326 return (1); 2327 } 2328 return (0); 2329 } 2330 2331 int 2332 check_header(ullong_t start, struct aio_str *aiop) 2333 { 2334 if (!is_readonly()) { 2335 ushort16_t bufhdrchksum; 2336 ushort16_t hdrchksum; 2337 2338 bufhdrchksum = get_bufhdr_hdrchksum(aiop->buf); 2339 hdrchksum = check_bufhdr(aiop->buf, bufhdrchksum); 2340 2341 if (bufhdrchksum != hdrchksum || hdrchksum == 0) { 2342 plog(LOG_ERR, "block %llx bad header checksum\n", 2343 aio_str2byteoff(aiop)); 2344 return (0); 2345 } else { 2346 struct bufhdr_a hdr_a; 2347 ullong_t off; 2348 struct bufhdr hdr; 2349 hdr = get_bufhdr(aiop->buf); 2350 2351 (void) check_old_data(aiop); 2352 2353 hdr_a = conv_bufhdr(&hdr); 2354 2355 off = byteoff2diskomizer_off(hdr_a.off); 2356 2357 if (off != aiop->off) { 2358 plog(LOG_ERR, 2359 "On disk header says device byte offset " 2360 "%llx (0t%lld), which calculates " 2361 "diskomizer block %#llx (0t%lld), I " 2362 "requested diskomizer block " 2363 "%#llx (0t%lld)\n", 2364 hdr_a.off, hdr_a.off, off, off, 2365 aiop->off, aiop->off); 2366 return (0); 2367 } 2368 } 2369 } 2370 return (1); 2371 } 2372 2373 int 2374 do_memcmp(ullong_t start, struct aio_str *aiop) 2375 { 2376 check_t check_sum; 2377 uchar_t *last; 2378 struct blks *blocks; 2379 int status = 0; 2380 if (check_header(start, aiop) == 0) 2381 return (0); 2382 blocks = aio_attach(aiop); 2383 2384 if ((last = blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io) != NULL) { 2385 if (memcmp(get_buf_data(last), get_buf_data(aiop->buf), 2386 INDEX_TO_DIOLEN(aiop->iolen) - SIZEOF_BUFHDR)) { 2387 struct bufhdr_a hdr_a; 2388 struct bufhdr hdr = get_bufhdr(last); 2389 2390 hdr_a = conv_bufhdr(&hdr); 2391 /* the memcmp failed */ 2392 
check_sum = check_aiobuf(aiop); 2393 dfprintf(stderr, "block %llx buf %#lx does not match " 2394 "what was written, what was read %#lx," 2395 " written %#lx\n", aio_str2byteoff(aiop), 2396 (ulong_t)last, check_sum, hdr_a.chksum); 2397 if (check_previous_buffer(check_sum, aiop) == 0) { 2398 check_old_data(aiop); 2399 } 2400 } else { 2401 status = 1; 2402 } 2403 } else { 2404 status = 1; 2405 } 2406 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 2407 return (status); 2408 } 2409 2410 /* 2411 * print number of bytes will print the given number in full and then 2412 * convert it to a human readable form and print it to 2 decimal places. 2413 */ 2414 void 2415 print_number_of_bytes(unsigned long long x, char *singular, char *plural) 2416 { 2417 char *str = x != 1 ? plural : singular; 2418 const char *units; 2419 int j; 2420 int y; 2421 /* 2422 * All the units that fit in 64 bits: 2423 * kilo, mega, giga, tera, peta, exa 2424 */ 2425 static const char *all_units[] = { "K", "M", "G", "T", "P", "E" }; 2426 2427 (void) printf("\t%#llx, %lld, %s ", x, x, str); 2428 2429 units = NULL; 2430 2431 for (j = 0; j < (sizeof (all_units) / sizeof (all_units[0])); j++) { 2432 if (x / 1024) { 2433 y = ((x * 1000) / 1024) % 1000; 2434 x = x / 1024; 2435 units = all_units[j]; 2436 } else { 2437 break; 2438 } 2439 } 2440 2441 if (units) { 2442 /* Round up it necessary */ 2443 if (y % 10 >= 5) { 2444 y = y + 10; 2445 } 2446 /* loose the least significant digit */ 2447 y = y/10; 2448 if (y >= 100) { 2449 y -= 100; 2450 x++; 2451 } 2452 2453 (void) printf("(%lld.%.2d %s)\n", x, y, units); 2454 } else { 2455 (void) printf("\n"); 2456 } 2457 } 2458 2459 void 2460 print_number(unsigned long long i, char *singular, char *plural) 2461 { 2462 char *str = i != 1 ? 
plural : singular; 2463 (void) printf("\t%#llx, %lld, %s\n", i, i, str); 2464 } 2465 /* 2466 * given that the bufhdr for the io has a good check sum but is not 2467 * for this device find the correct device and offset for the io 2468 * and report this. 2469 */ 2470 struct fds * 2471 check_matching_path_io(struct bufhdr_a *hdr, struct fds *fd) 2472 { 2473 struct fds *x, *fdh; 2474 2475 fdh = fd; 2476 2477 for (x = fd->next; /* make cstyle happy */; x = x->next) { 2478 if (memcmp(&hdr->devid, &fd->devid, 2479 sizeof (struct device_id)) == 0) { 2480 return (fd); 2481 } 2482 if (x == fdh) { 2483 return (NULL); 2484 } else { 2485 fd = x; 2486 } 2487 } 2488 /*NOTREACHED*/ 2489 } 2490 void 2491 read_and_check(ullong_t start, struct device *dev, ullong_t off, 2492 struct aio_str *aiop) 2493 { 2494 uchar_t *buf; 2495 ullong_t status; 2496 ullong_t diskoff = diskomizer_off2byteoff(off); 2497 struct blks *blocks; 2498 struct blks *block; 2499 struct shadow_hdr const *shadow; 2500 int error_count = 0; 2501 2502 buf = calloc(1, INDEX_TO_DIOLEN(max_disk_io_len)); 2503 2504 if (buf == NULL) { 2505 CALLOC_ERROR(1L, (ulong_t)INDEX_TO_DIOLEN(max_disk_io_len)); 2506 return; 2507 } 2508 while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) { 2509 if (error_count++ % 10000 == 0) 2510 ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off)); 2511 } 2512 if (error_count > 0) 2513 plog(LOG_WARNING, "attached o.k.\n"); 2514 block = &blocks[DEV_BLOCK_INDEX(dev, off)]; 2515 aiop->daio_id.buf = block->r.w.last_io; 2516 aiop->daio_id.bufs = 2517 INDEX_TO_DIOLEN(opts.disk_io_sizes.vals[block->r.w.last_iolen]); 2518 aiop->daio_id.buf_id = get_write_buf_id(block->r.w.last_io); 2519 2520 aiop->hdr = build_bufhr(dev, start, off); 2521 aiop->daio_id.hdr = (uchar_t *)&aiop->hdr; 2522 aiop->daio_id.hdr_len = sizeof (aiop->hdr); 2523 aiop->daio_id.footer_len = 0; 2524 2525 shadow = get_shadow_hdr(aiop->daio_id.buf); 2526 aiop->daio_id.chksum = shadow->chksums[aiop->iolen]; 2527 2528 status = 
daio->pread(dev->fdhead->fd, buf, 2529 opts.disk_io_sizes.vals[block->r.w.last_iolen], diskoff, 2530 &aiop->daio_id); 2531 if (status == DAIO_CORRUPT) { 2532 int check_sum; 2533 struct error error; 2534 ulong_t shadow_chksum = check_bufbody(block->r.w.last_io, 2535 opts.disk_io_sizes.vals[block->r.w.last_iolen]); 2536 2537 ZERO_OBJ(error); 2538 2539 check_sum = check_buf(buf, 2540 INDEX_TO_DIOLEN(block->r.w.last_iolen), &error); 2541 2542 if (check_sum != shadow_chksum) { 2543 pfprintf(stderr, 2544 "Off %#llx (%lld) header differs " 2545 "on disk\n", diskoff, diskoff); 2546 aiop->error.dr = memdiff_buf(off, dev, buf, 2547 opts.disk_io_sizes.vals[block->r.w.last_iolen], 2548 dev->fdhead, "read and check", &aiop->error); 2549 } else { 2550 if (memcmp(get_buf_data(buf), 2551 get_buf_data(block->r.w.last_io), 2552 INDEX_TO_DIOLEN(block->r.w.last_iolen) - 2553 SIZEOF_BUFHDR) != 0) { 2554 pfprintf(stderr, 2555 "Off %#llx (%lld) body differs " 2556 "on disk\n", diskoff, diskoff); 2557 aiop->error.dr = memdiff_buf(off, dev, buf, 2558 opts.disk_io_sizes.vals[ 2559 block->r.w.last_iolen], 2560 dev->fdhead, "read and check", 2561 &aiop->error); 2562 } else { 2563 ullong_t off = aio_str2byteoff(aiop); 2564 2565 pfprintf(stderr, 2566 "Data at byte offset %#llx (%lld) " 2567 "on disk, matches the data just read " 2568 "from %#llx (%lld)\n", diskoff, diskoff, 2569 off, off); 2570 } 2571 } 2572 } else if (status != opts.disk_io_sizes.vals[block->r.w.last_iolen]) { 2573 PREAD_ERROR(dev->fdhead->fd, dev->fdhead->name, (ulong_t)buf, 2574 opts.disk_io_sizes.vals[block->r.w.last_iolen], diskoff); 2575 } 2576 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 2577 free(buf); 2578 } 2579 /* 2580 * Check the buffer contents matches an io which it internally thinks it 2581 * is. This error path is used when we have read a block X but the contents 2582 * of the block is not for block X but for block Y. So this routine gives 2583 * information regarding the last movements of block Y. 
2584 * 2585 * This routine must be called with all the mutexs held, see 2586 * mutex->grab_all(). 2587 * 2588 * To Do: 2589 * bounds checking must be done on off before using it! 2590 */ 2591 void 2592 check_by_buffer(ullong_t start, struct device *dev, struct aio_str *aiop) 2593 { 2594 struct bufhdr_a read_hdr; 2595 struct bufhdr hdr; 2596 ullong_t off; 2597 struct blks *block; 2598 struct blks *blocks; 2599 ulong_t shadow_chksum; 2600 int error_count = 0; 2601 bitmap_t *map; 2602 2603 2604 hdr = get_bufhdr(aiop->buf); 2605 2606 read_hdr = conv_bufhdr(&hdr); 2607 2608 off = byteoff2diskomizer_off(read_hdr.off); 2609 if ((long long)off < 0 || off >= LEN_BYTES2BLOCKS(dev)) { 2610 return; 2611 } 2612 2613 aiop->error.doff = off; 2614 aiop->error.dev = dev; 2615 2616 if (aiop->dev == dev && off == aiop->off) { 2617 return; 2618 } 2619 2620 map = attach_dev_writemap(dev); 2621 2622 if (test_write(map, off, LEN_BYTES2BLOCKS(dev))) { 2623 /* 2624 * Grr. There is an IO outstanding on this block on this device, 2625 * I don't know whether it is a read or a write 2626 */ 2627 shm_ops->detach(dev->writemap_handle); 2628 aiop->error.desc.UNABLE_TO_LOCK = 1; 2629 dfprintf(stderr, 2630 "Block %#llx 0t%lld is currently locked for dev %s\n", 2631 diskomizer_off2byteoff(off), 2632 diskomizer_off2byteoff(off), dev->logicalname); 2633 return; 2634 2635 } 2636 shm_ops->detach(dev->writemap_handle); 2637 while ((blocks = shm_ops->attach(DEV_BLOCK_HANDLE(dev, off))) == NULL) { 2638 if (error_count++ % 10000 == 0) 2639 ATTACH_ERROR(DEV_BLOCK_HANDLE(dev, off)); 2640 } 2641 if (error_count > 0) 2642 plog(LOG_WARNING, "attached o.k.\n"); 2643 block = &blocks[DEV_BLOCK_INDEX(dev, off)]; 2644 2645 if (block->r.w.last_io != NULL && 2646 (shadow_chksum = check_bufbody(block->r.w.last_io, 2647 opts.disk_io_sizes.vals[block->r.w.last_iolen])) == 2648 read_hdr.chksum && 2649 memcmp(get_buf_data(aiop->buf), 2650 get_buf_data(block->r.w.last_io), 2651 opts.disk_io_sizes.vals[aiop->iolen] - 
SIZEOF_BUFHDR) == 0) { 2652 struct fds *fd; 2653 2654 fd = find_path(dev->fdhead, block->path_id); 2655 assert(fd != NULL); 2656 aiop->error.desc.MATCHING_LAST = 1; 2657 aiop->error.last_requested = block->last_requested; 2658 aiop->error.delta = block->last_returned_delta; 2659 aiop->error.doff = off; 2660 aiop->error.dev = dev; 2661 aiop->error.path_id = block->path_id; 2662 2663 dlog(LOG_ERR, "Buffer matches last write to block %#llx " 2664 "(0t%lld) (block %#llx 0t%lld) on dev %s path %s\n", 2665 off, off, diskomizer_off2byteoff(off), 2666 diskomizer_off2byteoff(off), dev->logicalname, 2667 fd->name); 2668 dtime_log(LOG_ERR, block->last_requested, 2669 "Last write to %s block %#llx (0t%lld) requested", 2670 dev->logicalname, diskomizer_off2byteoff(off), 2671 diskomizer_off2byteoff(off)); 2672 2673 dtime_log(LOG_ERR, 2674 block->last_requested + 2675 block->last_returned_delta, 2676 "Last write to %s block %#llx (0t%lld)" 2677 " returned ", dev->logicalname, 2678 diskomizer_off2byteoff(off), 2679 diskomizer_off2byteoff(off)); 2680 read_and_check(start, dev, off, aiop); 2681 2682 } else if (block->r.w.prev_io != NULL) { 2683 shadow_chksum = check_bufbody(block->r.w.prev_io, 2684 MIN(opts.disk_io_sizes.vals[block->r.w.last_iolen], 2685 opts.disk_io_sizes.vals[block->r.w.prev_iolen])); 2686 if (shadow_chksum == read_hdr.chksum && 2687 memcmp(get_buf_data(aiop->buf), 2688 get_buf_data(block->r.w.prev_io), 2689 opts.disk_io_sizes.vals[aiop->iolen] - 2690 SIZEOF_BUFHDR)) { 2691 2692 aiop->error.desc.MATCHING_PREV = 1; 2693 aiop->error.doff = off; 2694 aiop->error.last_requested = block->last_requested; 2695 2696 dlog(LOG_ERR, 2697 "Buffer matches block %#llx (block %lld) dev " 2698 "%s prev io\n", 2699 (ullong_t)off, diskomizer_off2byteoff(off), 2700 dev->logicalname); 2701 dtime_log(LOG_ERR, block->u.prev_requested, 2702 "Prev write to %s block %#llx (0t%lld)" 2703 " requested %s\n", 2704 dev->logicalname, 2705 diskomizer_off2byteoff(off), 2706 
diskomizer_off2byteoff(off)); 2707 } else { 2708 dfprintf(stderr, "Buffer claiming to be from block " 2709 "%#llx dev %s does not match either of the " 2710 "last two ios.\n", 2711 (ullong_t)diskomizer_off2byteoff(off), 2712 dev->logicalname); 2713 } 2714 } 2715 shm_ops->detach(DEV_BLOCK_HANDLE(dev, off)); 2716 } 2717 /*ARGSUSED*/ 2718 void 2719 check_matching_io(ullong_t start, struct aio_str *aiop) 2720 { 2721 struct device *device; 2722 struct bufhdr_a hdr_a; 2723 struct bufhdr hdr = get_bufhdr(aiop->buf); 2724 2725 hdr_a = conv_bufhdr(&hdr); 2726 2727 if (hdr_a.hdrchksum != check_bufhdr(aiop->buf, hdr_a.hdrchksum)) { 2728 return; 2729 } 2730 mutex->grab_all(); 2731 for (device = devices; device != NULL; device = device->next) { 2732 struct fds *fd; 2733 if ((fd = check_matching_path_io(&hdr_a, 2734 device->fdhead)) != NULL) { 2735 /* 2736 * Now we have the device to which this io was sent 2737 */ 2738 aiop->error.desc.MATCHING_DEVICE = 1; 2739 aiop->error.fd = fd; 2740 aiop->error.doff = byteoff2diskomizer_off(hdr_a.off); 2741 aiop->error.last_requested = hdr_a.time; 2742 check_by_buffer(start, device, aiop); 2743 dfprintf(stderr, "Block read from %s matches block " 2744 "written to %s\n", aiop->fd->name, 2745 fd->name); 2746 } 2747 } 2748 mutex->drop_all(); 2749 } 2750 2751 static loop_type 2752 on_error_pause(ullong_t start, struct aio_str *aiop) 2753 { 2754 int isread = is_read_io(aiop); 2755 pfprintf(stderr, "On %s error pause %d seconds\n", 2756 isread ? 
"read" : "write", OPTION(pause_time)); 2757 (void) sleep(opts.pause_time); 2758 2759 if (!isread) { 2760 return (CONTINUE); 2761 } 2762 2763 if (!do_memcmp(start, aiop)) { 2764 check_matching_io(start, aiop); 2765 memdiff(aiop, "pause"); 2766 return (CONTINUE); 2767 } else { 2768 return (BREAK); 2769 } 2770 } 2771 static void 2772 bring_error_path_online(struct fds *fd) 2773 { 2774 struct fds *start = fd; 2775 2776 do { 2777 if (fd->error_path == 1) { 2778 pfprintf(stderr, 2779 "Path %s brought on line\n", fd->name); 2780 fd->error_path = 0; 2781 break; 2782 } 2783 fd = fd->next; 2784 } while (fd != start); 2785 } 2786 /*ARGSUSED*/ 2787 static loop_type 2788 on_error_fail_path(ullong_t start, struct aio_str *aiop) 2789 { 2790 char *name = aiop->fd->name; 2791 pfprintf(stderr, 2792 "On error fail path %s failed\n", aiop->fd->name); 2793 if (set_shared_stop_flag(aiop->fd->shared_data_handle) == -1) { 2794 aiop->fd->need_to_stop = 1; 2795 } else { 2796 aiop->fd->stop_flag = 1; 2797 cancel_all_io_byfd(aiop->fd); 2798 snapshot_recent(aiop->dev->recent); 2799 bring_error_path_online(aiop->fd); 2800 } 2801 newfd(aiop); 2802 if (aiop->fd->stop_flag == 1 || aiop->fd->need_to_stop == 1) { 2803 /* 2804 * All the paths have failed, we muddle on to complete any furhter 2805 * error action down this failed path. 2806 */ 2807 pfprintf(stderr, "On error fail path %s continuing\n", name); 2808 return (BREAK); 2809 } else { 2810 aiop->retrycnt = 0; 2811 pfprintf(stderr, "On error fail path %s retrying\n", name); 2812 return (RETRY); 2813 } 2814 } 2815 2816 /*ARGSUSED*/ 2817 static loop_type 2818 on_error_retry(ullong_t start, struct aio_str *aiop) 2819 { 2820 int isread = is_read_io(aiop); 2821 short max = (isread ? 2822 OPTION(max_read_retries) : OPTION(max_write_retries)); 2823 if (aiop->retrycnt < max) { 2824 pfprintf(stderr, 2825 "On %s error retry %d, %d remaining %s blk %#llx\n", 2826 isread ? 
"read": "write", 2827 1+aiop->retrycnt, max-(1+aiop->retrycnt), 2828 aiop->fd->name, aio_str2byteoff(aiop)); 2829 return (RETRY); 2830 } else { 2831 return (CONTINUE); 2832 } 2833 } 2834 2835 static loop_type 2836 on_error_rewrite(ullong_t start, struct aio_str *aiop) 2837 { 2838 ssize_t status; 2839 union err_info err_info; 2840 struct shadow_hdr const *shadow_hdr = get_shadow_hdr(aiop->buf); 2841 ullong_t offset = (ullong_t)start + 2842 (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*aiop->off); 2843 2844 err_info.str = "pwrite"; 2845 pfprintf(stderr, "%s On error rewrite\n", aiop->fd->name); 2846 2847 aiop->daio_id.buf = aiop->buf; 2848 aiop->daio_id.buf_id = get_write_buf_id(aiop->buf); 2849 aiop->daio_id.chksum = shadow_hdr->chksums[aiop->iolen]; 2850 aiop->daio_id.bufs = opts.disk_io_sizes.vals[aiop->iolen]; 2851 aiop->daio_id.hdr = (uchar_t *)&aiop->hdr; 2852 aiop->daio_id.footer_len = 0; 2853 aiop->daio_id.hdr_len = sizeof (aiop->hdr); 2854 (void) memcpy(aiop->daio_id.hdr, aiop->daio_id.buf, 2855 aiop->daio_id.hdr_len); 2856 2857 (void) my_gettimeofday(&aiop->tv, NULL); 2858 status = daio->pwrite(aiop->fd->fd, aiop->buf, 2859 opts.disk_io_sizes.vals[aiop->iolen], offset, &aiop->daio_id); 2860 DAIO_SET_RETURN(aiop->aio_res, status); 2861 DAIO_SET_ERROR(aiop->aio_res, errno); 2862 2863 if (status != opts.disk_io_sizes.vals[aiop->iolen]) { 2864 report_error(aiop, err_info, ERR_SYS); 2865 aiop->dev->errors += 1; 2866 return (CONTINUE); 2867 } 2868 return (BREAK); 2869 } 2870 2871 static loop_type 2872 on_error_reread(ullong_t start, struct aio_str *aiop) 2873 { 2874 ssize_t status; 2875 union err_info err_info; 2876 ullong_t offset = (ullong_t)start + 2877 (ullong_t)(INDEX_TO_DIOLEN(max_disk_io_len)*aiop->off); 2878 2879 err_info.str = "pread"; 2880 pfprintf(stderr, "%s On error re-read\n", aiop->fd->name); 2881 2882 (void) my_gettimeofday(&aiop->tv, NULL); 2883 status = daio->pread(aiop->fd->fd, aiop->buf, 2884 opts.disk_io_sizes.vals[aiop->iolen], offset, 
&aiop->daio_id); 2885 DAIO_SET_RETURN(aiop->aio_res, status); 2886 DAIO_SET_ERROR(aiop->aio_res, errno); 2887 if (status == DAIO_CORRUPT) { 2888 if (is_readonly() || !do_memcmp(start, aiop)) { 2889 report_error(aiop, err_info, ERR_CORRUPT); 2890 aiop->dev->errors += 1; 2891 memdiff(aiop, err_info.str); 2892 return (CONTINUE); 2893 } 2894 /* There should be an assert here */ 2895 } else if (status != opts.disk_io_sizes.vals[aiop->iolen]) { 2896 report_error(aiop, err_info, ERR_SYS); 2897 aiop->dev->errors += 1; 2898 return (CONTINUE); 2899 } 2900 return (BREAK); 2901 } 2902 2903 static int 2904 do_path_stop_check(struct fds *fd, struct device *dev) 2905 { 2906 if (fd->error_path == 0 && fd->stop_flag == 0) { 2907 if (fd->need_to_stop == 1 && 2908 set_shared_stop_flag(fd->shared_data_handle) != -1) { 2909 fd->need_to_stop = 0; 2910 /* 2911 * need to cancel all the io outstanding for this 2912 * path 2913 */ 2914 fd->stop_flag = 1; 2915 cancel_all_io_byfd(fd); 2916 snapshot_recent(dev->recent); 2917 bring_error_path_online(fd); 2918 } else { 2919 if (do_stop_check(fd->shared_data_handle) == 1) { 2920 fd->stop_flag = 1; 2921 cancel_all_io_byfd(fd); 2922 bring_error_path_online(fd); 2923 } else { 2924 return (0); 2925 } 2926 } 2927 } 2928 return (1); 2929 } 2930 2931 static int 2932 init_stop_check(void) 2933 { 2934 stop_check = do_stop_check; 2935 return (1); 2936 } 2937 2938 static int 2939 init_path_stop_check(void) 2940 { 2941 path_stop_check = do_path_stop_check; 2942 return (1); 2943 } 2944 2945 void 2946 newfd(struct aio_str *aiop) 2947 { 2948 struct fds *fd = aiop->fd; 2949 if (aiop->fd != aiop->fd->next) { 2950 while (aiop->fd->next != fd) { 2951 if (path_stop_check(aiop->fd->next, aiop->dev) == 0) { 2952 aiop->fd = aiop->fd->next; 2953 remove_from_aio_list(&fd->all_aios, aiop); 2954 add_to_all_aios(aiop); 2955 return; 2956 } 2957 aiop->fd = aiop->fd->next; 2958 } 2959 /* 2960 * To get here we searched them all and found none that 2961 * were not error paths or 
had been stopped. Reset the path 2962 * back to the original. 2963 */ 2964 aiop->fd = fd; 2965 } 2966 2967 } 2968 loop_type 2969 handle_err_generic(struct aio_str *aiop, ullong_t start, 2970 on_error_t *on_error_func) 2971 { 2972 struct blks *blocks = NULL; 2973 loop_type status = BREAK; 2974 2975 blocks = aio_attach(aiop); 2976 2977 aiop->dev->errors++; 2978 if (blocks[AIO_BLOCK_INDEX(aiop)].r.w.last_io != 2979 NULL && does_check(daio->what_checker())) { 2980 char i; 2981 union err_info err_info; 2982 2983 err_info.str = "aioread"; 2984 if (on_error_func == on_error_corrupt) { 2985 memdiff(aiop, (aiop->retrycnt == 0) ? 2986 "aioread" : "aioread RETRY"); 2987 check_matching_io(start, aiop); 2988 report_error(aiop, err_info, ERR_CORRUPT); 2989 } else { 2990 report_error(aiop, err_info, ERR_SYS); 2991 } 2992 2993 for (i = 0; on_error_func[i] != NULL; i++) { 2994 loop_type l; 2995 if ((l = on_error_func[i](start, aiop)) == BREAK) 2996 break; 2997 else if (l == RETRY) { 2998 status = RETRY; 2999 break; 3000 } 3001 } 3002 } 3003 if (incr_shared_device_error(aiop->dev->shared_data_handle, 3004 aiop->dev->errors) != -1) { 3005 aiop->dev->errors = 0; 3006 } 3007 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 3008 return (status); 3009 } 3010 loop_type 3011 handle_write_error(struct aio_str *aiop, ullong_t start) 3012 { 3013 loop_type status = BREAK; 3014 int i; 3015 3016 aiop->dev->errors++; 3017 for (i = 0; on_write_error[i] != NULL; i++) { 3018 loop_type l; 3019 if ((l = on_write_error[i](start, aiop)) == BREAK) 3020 break; 3021 else if (l == RETRY) { 3022 status = RETRY; 3023 break; 3024 } 3025 } 3026 if (incr_shared_device_error(aiop->dev->shared_data_handle, 3027 aiop->dev->errors) != -1) { 3028 aiop->dev->errors = 0; 3029 } 3030 return (status); 3031 } 3032 loop_type 3033 handle_read_corrupt(struct aio_str *aiop, ullong_t start) 3034 { 3035 return (handle_err_generic(aiop, start, on_error_corrupt)); 3036 } 3037 3038 loop_type 3039 handle_read_short(struct aio_str *aiop, 
ullong_t start) 3040 { 3041 return (handle_err_generic(aiop, start, on_error_short)); 3042 } 3043 static time_t 3044 handle_readonly(struct aio_str *aiop, ullong_t start, read_type_t read_type) 3045 { 3046 struct blks *blocks = NULL; 3047 struct timeval tv; 3048 hrtime_t delta = DAIO_GET_TIME_TAKEN(aiop->aio_res); 3049 3050 3051 TNF_PROBE_2(handle_read, "handle_readonly", 3052 "sunw%cte%diskomizer%aio readonly wait", 3053 tnf_opaque, aiop, aiop, 3054 aio_tnf_str, *aiop, aiop); 3055 3056 while (my_gettimeofday(&tv, NULL) == -1) 3057 pperror("gettimeofday"); 3058 3059 if (aiop->fd == NULL) { 3060 aiop->fd = aiop->dev->fdhead; 3061 add_to_aio_list(&aiop->fd->all_aios, aiop); 3062 } else { 3063 if (DAIO_RETURN(aiop->aio_res) == DAIO_CORRUPT) { 3064 if (handle_read_corrupt(aiop, start) == RETRY) { 3065 return (do_new_read(aiop, start, RETRY_READ)); 3066 } 3067 } else if (DAIO_RETURN(aiop->aio_res) != 3068 opts.disk_io_sizes.vals[aiop->iolen]) { 3069 if (handle_read_short(aiop, start) == RETRY) { 3070 return (do_new_read(aiop, start, RETRY_READ)); 3071 } 3072 } else { 3073 struct blks *block; 3074 aiop->fd->last_read_time = delta; 3075 if (aiop->retrycnt != 0) { 3076 char *now_str; 3077 3078 now_str = alloc_time_str(tv.tv_sec); 3079 pprintf("Read retry %d of block 0x%llx " 3080 "on %s o.k. 
%s\n", 3081 aiop->retrycnt, 3082 aio_str2byteoff(aiop), 3083 aiop->fd->name, 3084 NIL(now_str)); 3085 not_null_free(now_str); 3086 } 3087 blocks = aio_attach(aiop); 3088 3089 block = &blocks[AIO_BLOCK_INDEX(aiop)]; 3090 block->r.o.prev_io = 3091 blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_io; 3092 block->r.o.prev_chksum = 3093 blocks[AIO_BLOCK_INDEX(aiop)].r.o.last_chksum; 3094 block->r.o.last_io = 3095 (((ulong_t)aiop->daio_id.buf) & 0x1); 3096 block->r.o.last_chksum = aiop->daio_id.chksum; 3097 3098 block->u.prev_requested = block->last_requested; 3099 block->last_requested = aiop->tv.tv_sec; 3100 block->last_returned_delta = 3101 tv.tv_sec - aiop->tv.tv_sec; 3102 } 3103 if (blocks == NULL) { 3104 blocks = aio_attach(aiop); 3105 } 3106 3107 blocks[AIO_BLOCK_INDEX(aiop)].read_count += 1; 3108 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 3109 3110 clear_writemap(aiop); 3111 newfd(aiop); 3112 } 3113 update_aio_read_stats(aiop); 3114 aiop->count++; 3115 return_aio_read_buf(aiop); 3116 3117 return (do_new_read(aiop, start, read_type)); 3118 } 3119 time_t 3120 handle_readonly_rand(struct aio_str *aiop, ullong_t start) 3121 { 3122 return (handle_readonly(aiop, start, READ_ONLY_RAND)); 3123 } 3124 time_t 3125 handle_readonly_seq(struct aio_str *aiop, ullong_t start) 3126 { 3127 return (handle_readonly(aiop, start, READ_ONLY_SEQ)); 3128 } 3129 3130 time_t 3131 handle_read(struct aio_str *aiop, ullong_t start) 3132 { 3133 struct blks *blocks = NULL; 3134 hrtime_t delta = DAIO_GET_TIME_TAKEN(aiop->aio_res); 3135 3136 TNF_PROBE_2(handle_read, "handle_read", 3137 "sunw%cte%diskomizer%aio read wait", 3138 tnf_opaque, aiop, aiop, 3139 aio_tnf_str, *aiop, aiop); 3140 3141 aiop->fd->total_read--; 3142 3143 if (DAIO_RETURN(aiop->aio_res) == DAIO_CORRUPT) { 3144 if (!do_memcmp(start, aiop) && 3145 handle_read_corrupt(aiop, start) == RETRY) { 3146 return (do_new_read(aiop, start, RETRY_READ)); 3147 } 3148 } else if (DAIO_RETURN(aiop->aio_res) != 3149 opts.disk_io_sizes.vals[aiop->iolen]) { 
3150 if (handle_read_short(aiop, start) == RETRY) { 3151 return (do_new_read(aiop, start, RETRY_READ)); 3152 } 3153 } else { 3154 aiop->fd->last_read_time = delta; 3155 if (aiop->retrycnt != 0) { 3156 pprintf("Read retry %d of block 0x%llx on %s o.k.\n", 3157 aiop->retrycnt, aio_str2byteoff(aiop), 3158 aiop->fd->name); 3159 } 3160 if (opts.obscure_execute && is_executable(aiop->buf)) { 3161 run_func(aiop->buf, 3162 opts.disk_io_sizes.vals[aiop->iolen] - 3163 SIZEOF_BUFHDR); 3164 } 3165 } 3166 if (blocks == NULL) { 3167 blocks = aio_attach(aiop); 3168 } 3169 3170 blocks[AIO_BLOCK_INDEX(aiop)].read_count += 1; 3171 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 3172 3173 clear_writemap(aiop); 3174 3175 update_aio_read_stats(aiop); 3176 aiop->count++; 3177 return_aio_read_buf(aiop); 3178 3179 newfd(aiop); 3180 return (do_new_read(aiop, start, NORMAL_READ)); 3181 } 3182 time_t 3183 handle_read_then_write(struct aio_str *aiop, ullong_t start) 3184 { 3185 hrtime_t delta = DAIO_GET_TIME_TAKEN(aiop->aio_res); 3186 struct timeval tv; 3187 3188 while (my_gettimeofday(&tv, NULL) == -1) 3189 pperror("gettimeofday"); 3190 3191 update_aio_read_stats(aiop); 3192 if (aiop->fd == NULL) { 3193 /* This is the first write so no read to check */ 3194 aiop->count++; 3195 aiop->buf = get_write_buf(); 3196 aiop->fd = aiop->dev->fdhead; 3197 add_to_aio_list(&aiop->fd->all_aios, aiop); 3198 } else { 3199 aiop->fd->total_read--; 3200 3201 if (DAIO_RETURN(aiop->aio_res) == DAIO_CORRUPT) { 3202 /* handle read error */ 3203 if (!do_memcmp(start, aiop) && 3204 handle_read_corrupt(aiop, start) == RETRY) { 3205 return (do_new_read(aiop, start, RETRY_READ)); 3206 } 3207 } else if (DAIO_RETURN(aiop->aio_res) != 3208 opts.disk_io_sizes.vals[aiop->iolen]) { 3209 /* handle read error */ 3210 if (handle_read_short(aiop, start) == RETRY) { 3211 return (do_new_read(aiop, start, RETRY_READ)); 3212 } 3213 } else { 3214 if (opts.obscure_execute && is_executable(aiop->buf)) { 3215 run_func(aiop->buf, 3216 
opts.disk_io_sizes.vals[aiop->iolen] - 3217 SIZEOF_BUFHDR); 3218 } 3219 aiop->fd->last_read_time = delta; 3220 } 3221 return_read_buf(aiop->buf); 3222 /* 3223 * Need to return the disk block to the free list 3224 * 3225 * the use of clear_writemap_success() reflects the fact that 3226 * to get here the write to this block must have succeeded. 3227 */ 3228 clear_writemap_success(aiop); 3229 aiop->buf = get_write_buf(); 3230 (void) set_io_len(aiop); 3231 newfd(aiop); 3232 aiop->count++; 3233 } 3234 aiop->handler = handle_write_then_read; 3235 do_new_write(aiop, start, 0); 3236 return (tv.tv_sec); 3237 } 3238 time_t 3239 handle_write_then_read(struct aio_str *aiop, ullong_t start) 3240 { 3241 struct timeval tv; 3242 struct blks *block; 3243 struct blks *blocks; 3244 struct bufhdr hdr; 3245 3246 while (my_gettimeofday(&tv, NULL) == -1) 3247 pperror("gettimeofday"); 3248 3249 assert(aiop->buf == NULL || *aiop->buf == 0xAA || *aiop->buf == 0x55); 3250 3251 aiop->fd->total_write--; 3252 3253 if (DAIO_RETURN(aiop->aio_res) != 3254 opts.disk_io_sizes.vals[aiop->iolen]) { 3255 union err_info err_info; 3256 err_info.str = "aiowrite"; 3257 report_error(aiop, err_info, ERR_SYS); 3258 if (handle_write_error(aiop, start) == RETRY) { 3259 do_new_write(aiop, start, 1); 3260 } else { 3261 if (is_sequential(aiop) && 3262 (aiop->off % opts.nprocs) == this_proc()) { 3263 push_unwritten(aiop); 3264 } 3265 do_new_write(aiop, start, 0); 3266 } 3267 return (tv.tv_sec); 3268 } 3269 3270 update_aio_write_stats(aiop); 3271 aiop->count++; 3272 aiop->fd->last_write_time = DAIO_GET_TIME_TAKEN(aiop->aio_res); 3273 hdr = get_bufhdr(aiop->buf); 3274 blocks = aio_attach(aiop); 3275 3276 block = &blocks[AIO_BLOCK_INDEX(aiop)]; 3277 block->r.w.prev_io = block->r.w.last_io; 3278 block->r.w.prev_iolen = block->r.w.last_iolen; 3279 block->u.prev_requested = block->last_requested; 3280 block->last_requested = aiop->tv.tv_sec; 3281 block->last_returned_delta = tv.tv_sec - aiop->tv.tv_sec; 3282 
block->r.w.last_io = aiop->buf; 3283 block->r.w.last_iolen = aiop->iolen; 3284 if (hdr.start == BUF_TYPE_A) 3285 block->ab = 1; 3286 else 3287 block->ab = 0; 3288 3289 block->read_count = 0; 3290 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 3291 return_write_buf(aiop->buf); 3292 aiop->buf = get_read_buf(); 3293 aiop->handler = handle_read_then_write; 3294 return (do_new_read(aiop, start, WRITE_READ)); 3295 } 3296 time_t 3297 handle_write(struct aio_str *aiop, ullong_t start) 3298 { 3299 struct timeval tv; 3300 union err_info err_info; 3301 struct bufhdr hdr; 3302 struct blks *block; 3303 struct blks *blocks; 3304 3305 err_info.str = "aiowrite"; 3306 3307 if (aiop->buf != NULL) { 3308 if (*aiop->buf != 0xAA && *aiop->buf != 0x55) { 3309 char tmp = *aiop->buf; 3310 void *sig = expect_signal(SIGSEGV, 3311 "Buffer not mapped writable but was updated!", 3312 aiop->buf, sizeof (*aiop->buf)); 3313 *aiop->buf = 0; 3314 *aiop->buf = tmp; 3315 cancel_expected_signal(SIGSEGV, sig); 3316 exit(1); 3317 } 3318 if (aiop->fd != NULL) { 3319 aiop->fd->total_write--; 3320 } 3321 } else { 3322 if (aiop->fd != NULL) { 3323 aiop->fd->total_write--; 3324 plog(LOG_WARNING, "buf == NULL, off %#llx (0t%lld)\n", 3325 (ullong_t)aiop->off, (ullong_t)aiop->off); 3326 } 3327 } 3328 3329 TNF_PROBE_2(handle_write, "handle_write", 3330 "sunw%cte%diskomizer%aio write wait", 3331 tnf_opaque, aiop, aiop, 3332 aio_tnf_str, *aiop, aiop); 3333 while (my_gettimeofday(&tv, NULL) == -1) 3334 pperror("gettimeofday"); 3335 3336 if (DAIO_RETURN(aiop->aio_res) != 3337 opts.disk_io_sizes.vals[aiop->iolen]) { 3338 /* retry the write */ 3339 if (aiop->fd != NULL) { 3340 report_error(aiop, err_info, ERR_SYS); 3341 3342 if (handle_write_error(aiop, start) == RETRY) { 3343 do_new_write(aiop, start, 1); 3344 } else { 3345 if (is_sequential(aiop) && 3346 (aiop->off % opts.nprocs) == 3347 this_proc()) { 3348 push_unwritten(aiop); 3349 } 3350 do_new_write(aiop, start, 0); 3351 } 3352 return (tv.tv_sec); 3353 } else { 3354 
aiop->fd = aiop->dev->fdhead; 3355 add_to_aio_list(&aiop->fd->all_aios, aiop); 3356 do_new_write(aiop, start, 0); 3357 return (tv.tv_sec); 3358 } 3359 } 3360 aiop->fd->last_write_time = DAIO_GET_TIME_TAKEN(aiop->aio_res); 3361 3362 update_aio_write_stats(aiop); 3363 aiop->count++; 3364 3365 hdr = get_bufhdr(aiop->buf); 3366 if (aiop->retrycnt) { 3367 ullong_t block = aio_str2byteoff(aiop); 3368 pfprintf(stderr, 3369 "%s Block 0t%lld (%#llx) retry %d succeeded\n", 3370 aiop->fd->name, (ullong_t)block, 3371 (ullong_t)block, ++aiop->retrycnt); 3372 } 3373 3374 blocks = aio_attach(aiop); 3375 3376 block = &blocks[AIO_BLOCK_INDEX(aiop)]; 3377 block->r.w.prev_io = block->r.w.last_io; 3378 block->r.w.prev_iolen = block->r.w.last_iolen; 3379 block->u.prev_requested = block->last_requested; 3380 block->last_requested = aiop->tv.tv_sec; 3381 block->last_returned_delta = tv.tv_sec - aiop->tv.tv_sec; 3382 block->r.w.last_io = aiop->buf; 3383 block->r.w.last_iolen = aiop->iolen; 3384 if (hdr.start == BUF_TYPE_A) 3385 block->ab = 1; 3386 else 3387 block->ab = 0; 3388 3389 block->read_count = 0; 3390 shm_ops->detach(AIO_BLOCK_HANDLE(aiop)); 3391 3392 clear_writemap_success(aiop); 3393 3394 assert(aiop->buf != NULL); 3395 if (aiop->count % 3396 opts.expert_release_write_buffers_after_n_uses == 0) { 3397 return_write_buf(aiop->buf); 3398 aiop->buf = NULL; 3399 } 3400 newfd(aiop); 3401 do_new_write(aiop, start, 0); 3402 return (tv.tv_sec); 3403 } 3404 3405 void 3406 init_all_aio(struct device *devices, struct aio_str *aio, 3407 int count) 3408 { 3409 int i, j; 3410 struct device *device; 3411 3412 for (j = i = 0; i < count; i++) { 3413 for (device = devices; device != NULL; device = device->next) { 3414 if (i == 0) 3415 device->block = this_proc(); 3416 aio[j].dev = device; 3417 aio[j].fd = NULL; 3418 aio[j].iolen = 0; 3419 add_to_aio_list(&device->stopped_ios, &aio[j]); 3420 while (my_gettimeofday(&device->state_ttl, NULL) == -1) 3421 pperror("gettimeofday"); 3422 3423 j++; 3424 } 
3425 } 3426 } 3427 void 3428 cancel_all_io_byfd(struct fds *fd) 3429 { 3430 struct aioqtop not_cancelled; 3431 struct aioqtop cancelled; 3432 struct aio_str *io; 3433 struct device *devp; 3434 int count = 0; 3435 3436 ZERO_OBJ(not_cancelled); 3437 ZERO_OBJ(cancelled); 3438 3439 while ((io = pop_from_aio_list(&fd->all_aios)) != NULL) { 3440 devp = io->dev; 3441 if (daio->cancel(&io->aio_res) == -1) { 3442 io->count = errno; 3443 add_to_aio_list(¬_cancelled, io); 3444 } else { 3445 count++; 3446 add_to_aio_list(&cancelled, io); 3447 } 3448 } 3449 plog(LOG_WARNING, "%d io's cancelled to path %s\n", count, 3450 fd->name); 3451 while ((io = pop_from_aio_list(&cancelled)) != NULL) { 3452 DAIO_SET_ERROR(io->aio_res, ECANCELED); 3453 DAIO_SET_RETURN(io->aio_res, -1); 3454 add_to_aio_list(&devp->cancelled, io); 3455 cancelled_count++; 3456 } 3457 if (cancelled_count) { 3458 start_cancelled_io = do_start_cancelled_io; 3459 } 3460 fd->all_aios = not_cancelled; 3461 } 3462 void 3463 cancel_all_io(void) 3464 { 3465 int i = 0; 3466 int errors = 0; 3467 int total = 0; 3468 int reaped = 0; 3469 struct aio_str *io; 3470 struct aio_str *first_error_io = NULL; 3471 struct fds *fd; 3472 struct aioqtop not_cancelled; 3473 struct device *dev; 3474 /* 3475 * If daio is NULL, then no paths can be open so nothing more to do. 3476 */ 3477 if (daio == NULL) 3478 return; 3479 3480 time_now_log(LOG_NOTICE, gettext("cancelling all aios\n")); 3481 3482 ZERO_OBJ(not_cancelled); 3483 3484 for (dev = devices; dev != NULL; dev = dev->next) { 3485 for (fd = dev->fdhead; ; fd = fd->next) { 3486 while ((io = pop_from_aio_list( 3487 &fd->all_aios)) != NULL) { 3488 total++; 3489 if (daio->cancel(&io->aio_res) == -1) { 3490 io->count = errno; 3491 add_to_aio_list(¬_cancelled, io); 3492 } else { 3493 i++; 3494 } 3495 } 3496 if (fd->next == dev->fdhead) 3497 break; 3498 } 3499 } 3500 /* 3501 * Now reap all the remaining ios, popping them off the list of 3502 * ios that could not be cancelled. 
3503 */ 3504 while ((io = (aio_str_t *)daio->wait(NULL)) != (aio_str_t *)-1 || 3505 errno != EINVAL) { 3506 if (io != (aio_str_t *)-1 && io != (aio_str_t *)0) { 3507 reaped++; 3508 remove_from_aio_list(¬_cancelled, io); 3509 } 3510 } 3511 /* 3512 * If the list contains more than one entry there was a problem, 3513 * probably in the internal logic of diskomizer. 3514 */ 3515 while ((io = pop_from_aio_list(¬_cancelled)) != NULL) { 3516 errno = io->count; 3517 /* 3518 * If we were interupted the signal might have come in while 3519 * we were handling an io so we could have just one io 3520 * that is not in the aio system. So only report errors 3521 * if there are more then one. If there are more than one 3522 * report them all. 3523 * 3524 */ 3525 if (errors++ > 0) { 3526 if (first_error_io != NULL) { 3527 AIOCANCEL_ERROR(first_error_io); 3528 first_error_io = NULL; 3529 } 3530 AIOCANCEL_ERROR(io); 3531 } else { 3532 first_error_io = io; 3533 } 3534 } 3535 for (dev = devices; dev != NULL; dev = dev->next) { 3536 close_and_free_paths(dev); 3537 } 3538 time_now_log(LOG_NOTICE, 3539 "%d/%d aios cancelled successfully, %d reaped\n", 3540 i, total, reaped); 3541 } 3542 pid_t 3543 master_pid() 3544 { 3545 return (parent_pid); 3546 } 3547 static int 3548 is_master() 3549 { 3550 return (parent_pid == getpid()); 3551 } 3552 static void 3553 register_death(pid_t pid) 3554 { 3555 int i; 3556 3557 for (i = 0; i < opts.nprocs; i++) { 3558 if (proc_store[i].pid != pid) { 3559 proc_store[i].pid = 0; 3560 break; 3561 } 3562 } 3563 } 3564 static int 3565 ischildless() 3566 { 3567 int i; 3568 3569 if (proc_store != NULL) { 3570 for (i = 0; i < opts.nprocs; i++) { 3571 if (proc_store[i].pid != 0) 3572 return (0); 3573 } 3574 } 3575 return (1); 3576 } 3577 static int 3578 haskids() 3579 { 3580 return (!ischildless()); 3581 } 3582 static void 3583 mourning(pid_t pid, int stat) 3584 { 3585 union { 3586 char dir[PATH_MAX]; 3587 char buf[SIG2STR_MAX]; 3588 } u; 3589 if (pid == -1) 3590 
return; 3591 3592 if (!WIFEXITED(stat) && !WIFSIGNALED(stat)) 3593 return; 3594 3595 register_death(pid); 3596 3597 if (WIFSIGNALED(stat)) { 3598 char *x = strsignal(WTERMSIG(stat)); 3599 if (sig2str(WTERMSIG(stat), u.buf) == -1) { 3600 (void) strcpy(u.buf, "(Unknown)"); 3601 } 3602 plog(LOG_ERR, 3603 "Process %ld killed by signal %d %s,%s%s%s.\n", 3604 (ulong_t)pid, WTERMSIG(stat), u.buf, 3605 x == NULL ? "" : " ", x == NULL ? "" : x, 3606 WCOREDUMP(stat) ? " core dumped" : ""); 3607 } 3608 if (pid) { 3609 (void) snprintf(u.dir, sizeof (u.dir), "%s/%ld", 3610 opts.workingdir, (ulong_t)pid); 3611 plog(LOG_DEBUG, "removing %s\n", u.dir); 3612 if (rmdir(u.dir) == -1) 3613 pperror("rmdir(%s)", u.dir); 3614 } 3615 } 3616 3617 static int 3618 all_countdowns_zero(struct device *devp) 3619 { 3620 while (devp) { 3621 if (devp->countdown > 0) { 3622 return (0); 3623 } 3624 devp = devp->next; 3625 } 3626 return (1); 3627 } 3628 3629 static int 3630 stoptime_reached(void) 3631 { 3632 return (stoptime > 0 && stoptime < gethrtime()); 3633 } 3634 3635 static void 3636 report_exit_reason(void) 3637 { 3638 if (opts.nloops && all_countdowns_zero(devices)) { 3639 time_now_log(LOG_NOTICE, 3640 "All devices have completed %ld loops; exiting", 3641 opts.nloops); 3642 } else if (stoptime_reached()) { 3643 time_now_log(LOG_NOTICE, "stop time reached; exiting"); 3644 } 3645 } 3646 3647 void 3648 cleanup(void) 3649 { 3650 (void) sigignore(SIGTERM); 3651 (void) sigignore(SIGINT); 3652 new_log_transaction(stderr); 3653 if (is_master()) { 3654 int stat; 3655 pid_t pid; 3656 struct device *dev; 3657 3658 infantacide(); 3659 3660 while (haskids() && 3661 (pid = waitpid((pid_t)-1, &stat, WNOHANG)) != -1 && 3662 errno != ECHILD) { 3663 if (pid == 0) { 3664 sleep(1); 3665 infantacide(); 3666 } else { 3667 mourning(pid, stat); 3668 } 3669 } 3670 save_data_bufs(); 3671 3672 for (dev = devices; dev != NULL; dev = dev->next) { 3673 close_and_free_paths(dev); 3674 } 3675 if (rmdir(opts.workingdir) == 
-1) { 3676 pperror("rmdir(%s)", opts.workingdir); 3677 } 3678 shm_ops->fini(); 3679 } else { 3680 report_exit_reason(); 3681 cancel_all_io(); 3682 } 3683 3684 time_now_log(LOG_NOTICE, "exiting"); 3685 } 3686 /* 3687 * change_dir change into our own directory. 3688 */ 3689 3690 void 3691 change_dir() 3692 { 3693 char dir[PATH_MAX]; 3694 3695 (void) snprintf(dir, sizeof (dir), 3696 "%s/%ld", opts.workingdir, (ulong_t)getpid()); 3697 3698 if (mkdir(opts.workingdir, 0755) == -1 && errno != EEXIST) 3699 pperror("mkdir(%s, 0755)", opts.workingdir); 3700 if (mkdir(dir, 0755) == -1) 3701 pperror("mkdir(%s, 0755)", dir); 3702 3703 if (chdir(dir) == -1) { 3704 pperror("chdir(%s)", dir); 3705 (void) snprintf(dir, sizeof (dir), 3706 "%s.%ld", diffs_str, (ulong_t)getpid()); 3707 diffs = strdup(dir); 3708 if (diffs == NULL) 3709 diffs = diffs_str; 3710 } else { 3711 diffs = diffs_str; 3712 } 3713 } 3714 3715 /* 3716 * aios_outstanding_or_on_hold: 3717 * return 1 if there are aios outstanding. 3718 * return 0 if there are none. 
3719 */ 3720 int 3721 aios_queued_to_fd(struct device *dev) 3722 { 3723 struct fds *fd; 3724 for (fd = dev->fdhead; ; fd = fd->next) { 3725 if (is_aio_on_list(&fd->all_aios)) 3726 return (1); 3727 if (fd->next == dev->fdhead) 3728 break; 3729 } 3730 return (0); 3731 } 3732 struct device * 3733 first_to_restart(struct device *devices) 3734 { 3735 struct device *dev; 3736 struct device *first_to_start; 3737 struct timeval now_tv; 3738 3739 do { 3740 while (my_gettimeofday(&now_tv, NULL) == -1) 3741 pperror("gettimeofday"); 3742 3743 for (first_to_start = devices; 3744 first_to_start != NULL && ( 3745 is_aio_on_list(&first_to_start->stopped_ios) == 0 || 3746 get_dev_state(first_to_start, &now_tv) == 3747 DEV_NOT_READY); 3748 first_to_start = first_to_start->next) { 3749 /*LINTED*/ 3750 } 3751 if (first_to_start == NULL) 3752 break; 3753 3754 for (dev = first_to_start->next; dev != NULL; 3755 dev = dev->next) { 3756 if (get_dev_state(dev, &now_tv) == DEV_NOT_READY) { 3757 continue; 3758 } 3759 if (is_aio_on_list(&first_to_start->stopped_ios) != 3760 0 && dev->state_ttl.tv_sec != -1 && 3761 (first_to_start->state_ttl.tv_sec == -1 || 3762 timeval_lt(dev->state_ttl, 3763 first_to_start->state_ttl))) { 3764 first_to_start = dev; 3765 } 3766 } 3767 } while (first_to_start != NULL && 3768 first_to_start->state_ttl.tv_sec == -1 && sleep(1) != 2); 3769 3770 return (first_to_start); 3771 } 3772 3773 struct aio_str * 3774 wait_to_restart(struct device *devices) 3775 { 3776 struct device *first_to_start; 3777 struct aio_str *aiop; 3778 struct timeval tv; 3779 3780 first_to_start = first_to_restart(devices); 3781 3782 if (first_to_start == NULL) 3783 return (NULL); 3784 3785 while (my_gettimeofday(&tv, NULL) == -1) 3786 pperror("gettimeofday"); 3787 3788 tv = timeval_timeval_sub(first_to_start->state_ttl, tv); 3789 3790 if (tv.tv_sec) { 3791 char buf[128]; 3792 (void) strftime(buf, 128, TIME_FORMAT, 3793 localtime(&first_to_start->state_ttl.tv_sec)); 3794 3795 if (tv.tv_sec > 
secs_till_exit()) { 3796 pprintf("All IO on hold until after our exit time.\n"); 3797 exit(0); 3798 } 3799 pprintf("Sleeping for %ld seconds, until %s\n", tv.tv_sec, buf); 3800 (void) sleep(tv.tv_sec); 3801 check_exit_flag(); 3802 } 3803 if (tv.tv_usec) { 3804 (void) usleep(tv.tv_usec); 3805 check_exit_flag(); 3806 } 3807 3808 aiop = pop_from_aio_list(&first_to_start->stopped_ios); 3809 if (aiop != NULL && aiop->fd != NULL) 3810 add_to_aio_list(&aiop->fd->all_aios, aiop); 3811 return (aiop); 3812 } 3813 int 3814 aios_outstanding(struct device *devices) 3815 { 3816 struct device *dev; 3817 3818 for (dev = devices; dev != NULL; dev = dev->next) { 3819 if (aios_queued_to_fd(dev)) { 3820 return (1); 3821 } 3822 } 3823 return (0); 3824 } 3825 #ifdef NOT_USED 3826 int 3827 aios_on_hold(struct device *devices) 3828 { 3829 struct device *dev; 3830 3831 for (dev = devices; dev != NULL; dev = dev->next) { 3832 if (is_aio_on_list(&dev->stopped_ios)) { 3833 return (1); 3834 } 3835 } 3836 return (0); 3837 } 3838 #endif 3839 int 3840 aios_outstanding_or_on_hold(struct device *devices) 3841 { 3842 struct device *dev; 3843 3844 for (dev = devices; dev != NULL; dev = dev->next) { 3845 if (is_aio_on_list(&dev->cancelled)) 3846 return (1); 3847 if (is_aio_on_list(&dev->stopped_ios)) 3848 return (1); 3849 if (aios_queued_to_fd(dev)) { 3850 return (1); 3851 } 3852 } 3853 return (0); 3854 } 3855 /* 3856 * Return true if diskomizer would exit before the time supplied is reached. 3857 */ 3858 int 3859 would_stop_before(time_t secs) 3860 { 3861 time_t xit; 3862 if (secs == -1 || (xit = secs_till_exit()) < 0) { 3863 return (0); 3864 } else { 3865 struct timeval tv; 3866 3867 while (my_gettimeofday(&tv, NULL) == -1) 3868 pperror("gettimeofday"); 3869 3870 return (secs >= tv.tv_sec + xit); 3871 } 3872 } 3873 /*ARGSUSED*/ 3874 static int 3875 has_cancelled(struct device *devices) 3876 { 3877 return (cancelled_count == 0 ? 
0 : 1); 3878 #ifdef SLOW_BUT_SURE 3879 while (device != NULL) { 3880 if (is_aio_on_list(&device->cancelled)) { 3881 return (1); 3882 } 3883 device = device->next; 3884 } 3885 return (0); 3886 #endif 3887 } 3888 struct timeval 3889 get_timeout(struct device *devices, int report_time) 3890 { 3891 struct device *first_to_start; 3892 struct timeval tv; 3893 time_t secs_til_xit; 3894 3895 if (has_cancelled(devices)) { 3896 tv.tv_sec = tv.tv_usec = 0; 3897 return (tv); 3898 } 3899 secs_til_xit = secs_till_exit(); 3900 if (secs_til_xit < 0) { 3901 secs_til_xit = report_time; 3902 } 3903 3904 if (!all_running()) { 3905 first_to_start = first_to_restart(devices); 3906 } else { 3907 first_to_start = NULL; 3908 } 3909 3910 while (my_gettimeofday(&tv, NULL) == -1) 3911 pperror("gettimeofday"); 3912 3913 if (first_to_start == NULL) { 3914 tv.tv_sec = MIN(secs_til_xit, report_time); 3915 tv.tv_usec = 0; 3916 return (tv); 3917 } 3918 3919 tv = timeval_timeval_sub(first_to_start->state_ttl, tv); 3920 3921 if (tv.tv_sec > report_time || tv.tv_sec > secs_til_xit) { 3922 tv.tv_sec = MIN(secs_til_xit, report_time); 3923 tv.tv_usec = 0; 3924 } else if (tv.tv_sec < 0) { 3925 tv.tv_sec = tv.tv_usec = 0; 3926 } 3927 return (tv); 3928 } 3929 void 3930 init_all_blk_str(struct device *dev, int proc_no) 3931 { 3932 int i; 3933 struct timeval now_tv; 3934 3935 for (i = 0; dev != NULL; dev = dev->next, i++) { 3936 if ((i % opts.nprocs) == proc_no) { 3937 /* init_block_str(dev); */ 3938 while (my_gettimeofday(&now_tv, NULL) == -1) 3939 pperror("gettimeofday"); 3940 (void) set_dev_state(dev, DEV_NOT_READY, DEV_STOPPED, 3941 &now_tv); 3942 } 3943 } 3944 } 3945 /* 3946 * Start the first cancelled io for each device. 
3947 */ 3948 void 3949 do_start_cancelled_io(struct device *devices, ullong_t start) 3950 { 3951 struct device *devp; 3952 3953 for (devp = devices; devp != NULL; devp = devp->next) { 3954 struct aio_str *aiop; 3955 3956 check_exit_flag(); 3957 3958 aiop = pop_from_aio_list(&devp->cancelled); 3959 if (aiop != NULL) { 3960 cancelled_count--; 3961 aiop->handler(aiop, start); 3962 } 3963 } 3964 if (!has_cancelled(devices)) { 3965 start_cancelled_io = 3966 (void (*)(struct device *devices, ullong_t start))nop; 3967 } 3968 } 3969 3970 static void 3971 report_times(void) 3972 { 3973 struct tms tms; 3974 3975 if (times(&tms) != (clock_t)-1) { 3976 plog(LOG_NOTICE, "User time %d seconds\n", 3977 tms.tms_cutime/CLK_TCK); 3978 plog(LOG_NOTICE, "System time %d seconds\n", 3979 tms.tms_cstime/CLK_TCK); 3980 } 3981 } 3982 3983 void 3984 do_aio(struct device *devices, ullong_t start, int report_time) 3985 { 3986 struct aio_str *aio_writes; 3987 int i; 3988 int rdflag = 0; 3989 pid_t pid; 3990 int ndevices = how_many_devices(devices); 3991 dev_state dev_state; 3992 3993 if (opts.wthreads + opts.wrthreads == 0) { 3994 plog(LOG_ERR, "WTHREADS and WRTHREADS can not both be zero\n"); 3995 exit(1); 3996 } 3997 3998 aio_writes = my_calloc((opts.wthreads + opts.wrthreads) * ndevices, 3999 sizeof (struct aio_str)); 4000 if (aio_writes == NULL) { 4001 pfprintf(stderr, "Can't allocate write structures\n"); 4002 exit(1); 4003 } 4004 proc_store = my_calloc(opts.nprocs, sizeof (struct proc_store)); 4005 if (proc_store == NULL) { 4006 pfprintf(stderr, "Can't allocate process store\n"); 4007 exit(1); 4008 } 4009 if (opts.seconds_to_run > 0) { 4010 secs_till_exit = do_secs_till_exit; 4011 stoptime = gethrtime() + (opts.seconds_to_run * 1000 * MILLION); 4012 } else { 4013 secs_till_exit = inf_secs_till_exit; 4014 stoptime = -1; 4015 } 4016 4017 (void) printf("\tPID = %ld\n", (ulong_t)getpid()); 4018 (void) printf("\t%s\n", gettext(checker_string(daio->what_checker()))); 4019 4020 if (opts_fini() 
!= 0) { 4021 exit(EXIT_FAILURE); 4022 } 4023 4024 save_usage_tracking(usage_tracking_handle, opts.obscure_usage_file); 4025 send_usage_tracking(usage_tracking_handle); 4026 close_usage_tracking(usage_tracking_handle); 4027 4028 init_read_bufs(devices); 4029 init_all_write_bufs(aio_writes, devices); 4030 4031 shm_ops->complete(NULL); 4032 4033 report_uadmin(); 4034 4035 if (opts.debug_no_action) { 4036 exit(EXIT_SUCCESS); 4037 } 4038 4039 if (!is_readonly() && opts.expert_do_path_check && 4040 check_for_duplicate_paths(devices) == 0) { 4041 exit(EXIT_FAILURE); 4042 } 4043 4044 (void) sighold(SIGTERM); 4045 (void) sighold(SIGINT); 4046 proc_no = 0; 4047 NOTE(COMPETING_THREADS_NOW) 4048 for (i = 0; i < opts.nprocs; i++) { 4049 int forkcount = 0; 4050 (void) fflush(stdout); 4051 (void) fflush(stderr); 4052 do { 4053 pid = opts.use_fork1 == 0 ? fork() : fork1(); 4054 4055 if (pid == -1) { 4056 FORK_ERROR(opts.use_fork1 == 0 ? 4057 "" : "1"); 4058 if (forkcount >= opts.max_fork_failure) 4059 break; 4060 forkcount++; 4061 (void) sleep(opts.fork_failure_wait_time); 4062 } 4063 } while (pid == -1); 4064 if (pid == -1) 4065 FORK_ERROR(opts.use_fork1 == 0 ? "" : "1"); 4066 else if (pid == 0) { 4067 proc_no = i; 4068 break; 4069 } 4070 proc_store[i].pid = pid; 4071 plog(LOG_DEBUG, "fork%s %ld\n", opts.use_fork1 == 0 ? 
"" : "1", 4072 (ulong_t)pid); 4073 } 4074 if (pid != 0) { /* We are the parent */ 4075 int status; 4076 4077 (void) sigrelse(SIGINT); 4078 (void) sigrelse(SIGTERM); 4079 while ((pid = waitpid((pid_t)-1, &status, 0)) != -1 && 4080 errno != ECHILD) { 4081 if (WIFEXITED(status) || WIFSIGNALED(status)) { 4082 mourning(pid, status); 4083 if (WEXITSTATUS(status) != 0 || 4084 WIFSIGNALED(status)) { 4085 exit_status = EXIT_FAILURE; 4086 } 4087 } 4088 } 4089 report_times(); 4090 exit(exit_status); 4091 } 4092 free(proc_store); 4093 4094 daio->init((opts.wthreads + opts.rthreads + opts.wrthreads) * ndevices); 4095 (void) sigrelse(SIGINT); 4096 (void) sigrelse(SIGTERM); 4097 change_dir(); 4098 /* init_all_blk_str(devices, proc_no); */ 4099 if (usr1_exit) 4100 exit(0); 4101 init_all_aio(devices, aio_writes, opts.wthreads + opts.wrthreads); 4102 4103 assert(devices->block == this_proc()); 4104 new_log_transaction(stderr); 4105 4106 /* This is the main loop for the diskomizer. */ 4107 for (i = 0; aios_outstanding_or_on_hold(devices) != 0; i++) { 4108 struct aio_str *aiop; 4109 struct timeval timeout, now_tv; 4110 time_t tyme; 4111 int x; 4112 4113 check_exit_flag(); 4114 if (aios_outstanding(devices) != 0) { 4115 timeout = get_timeout(devices, report_time); 4116 4117 new_log_transaction(stderr); 4118 aiop = (struct aio_str *)daio->wait(&timeout); 4119 4120 x = errno; 4121 check_exit_flag(); 4122 while (my_gettimeofday(&now_tv, NULL) == -1) 4123 pperror("gettimeofday"); 4124 4125 if ((long)aiop == -1) { 4126 errno = x; 4127 if (errno == EINVAL && 4128 aios_outstanding_or_on_hold( 4129 devices) == 0) { 4130 AIOWAIT_ERROR(timeout); 4131 exit(1); 4132 } 4133 AIOWAIT_ERROR(timeout); 4134 continue; 4135 } else if ((long)aiop == 0) { 4136 /* the aiowait timed out */ 4137 report_all_hangers(devices, report_time); 4138 restart_stopped_devices(start, devices, 4139 &now_tv); 4140 continue; 4141 } 4142 } else { 4143 /* 4144 * If all the io requests have been stopped then we call 4145 * 
wait_to_restart which will return the first io to 4146 * restart from all the devices and it will sleep until 4147 * that io is due to be queued. It will return NULL if the 4148 * next io to start would be started after the process should 4149 * have exited or if there are no stopped devices or if all 4150 * the ios have been deferred. 4151 */ 4152 if ((aiop = wait_to_restart(devices)) != NULL) { 4153 while (my_gettimeofday(&now_tv, NULL) == -1) 4154 pperror("gettimeofday"); 4155 dev_state = set_dev_state(aiop->dev, 4156 DEV_STOPPED, DEV_STARTING, &now_tv); 4157 assert(dev_state == DEV_STOPPED || 4158 dev_state == DEV_STARTING); 4159 } else if ((aiop = get_deferred_io(devices)) == NULL) { 4160 exit(exit_status); 4161 } 4162 } 4163 if (aiop == NULL || 4164 (opts.nloops != 0 && aiop->dev->countdown == 0) || 4165 stoptime_reached()) { 4166 /* this stops the other processes being killed */ 4167 usr1_exit++; 4168 if (aiop != NULL) { 4169 if (aiop->fd != NULL) 4170 remove_from_aio_list( 4171 &aiop->fd->all_aios, aiop); 4172 if (aiop->next != NULL) 4173 aiop->next->prev = aiop->prev; 4174 } 4175 continue; 4176 } 4177 /* 4178 * Controls. 
4179 */ 4180 dev_state = get_dev_state(aiop->dev, &now_tv); 4181 if (dev_state == DEV_RUNNING || dev_state == DEV_STARTING) { 4182 tyme = aiop->handler(aiop, start); 4183 } else { 4184 if (aiop->fd != NULL) { 4185 remove_from_aio_list(&aiop->fd->all_aios, 4186 aiop); 4187 } 4188 add_to_aio_list(&aiop->dev->stopped_ios, aiop); 4189 if (aios_queued_to_fd(aiop->dev) == 0) { 4190 (void) set_dev_state(aiop->dev, 4191 DEV_STOPPING, DEV_STOPPED, &now_tv); 4192 } 4193 } 4194 start_cancelled_io(devices, start); 4195 start_deferred(devices, start); 4196 report_hangers(aiop->dev, tyme, report_time); 4197 restart_stopped_devices(start, devices, &now_tv); 4198 if (aiop->off >= aiop->dev->read_start_block && 4199 aiop->handler != handle_read && 4200 i > (2*(opts.rthreads + opts.wthreads))) { 4201 if (opts.rthreads > aiop->dev->running_rthreads) { 4202 if (aiop->dev->running_rthreads == 0 && 4203 rdflag == 0) { 4204 4205 rdflag = 1; 4206 time_now_log(LOG_NOTICE, 4207 "Starting first %s reader %d", 4208 random_str, i); 4209 } 4210 if (init_read(aiop, start)) 4211 aiop->dev->running_rthreads++; 4212 } else if (rdflag == 1) { 4213 time_now_log(LOG_NOTICE, 4214 "All %sreaders started %d", 4215 random_str, i); 4216 rdflag = 2; 4217 } 4218 } 4219 } 4220 4221 exit(exit_status); 4222 } 4223 4224 4225 /* 4226 * select_error_func: 4227 * search the handlers array for an entry whose name matches the name 4228 * passed in. If the name passed in is NULL then default to using the 4229 * first handler in the list. 
4230 */ 4231 static int 4232 select_error_func(const char *name, 4233 struct error_handlers *handlers, 4234 int nhandlers, 4235 on_error_t *oef, 4236 int rw) 4237 { 4238 int i; 4239 struct error_handlers *h; 4240 4241 if (name == NULL) { 4242 h = &handlers[0]; 4243 } else for (i = 0; i < nhandlers; i++) { 4244 h = &handlers[i]; 4245 if (((h->rw & rw) != 0) && strcasecmp(h->name, name) == 0) { 4246 break; 4247 } else { 4248 h = NULL; 4249 } 4250 } 4251 if (h != NULL) { 4252 *oef = h->func; 4253 if (h->setup() != 1) { 4254 fprintf(stderr, "Unable to init %s\n", h->name); 4255 } 4256 return (h->breaker == 0 ? 0 : 1); 4257 } 4258 return (-1); 4259 } 4260 4261 on_error_t * 4262 setup_onerror(char *prog, const char *str, int rw) 4263 { 4264 char *tmp; 4265 char *opaque; 4266 char *toogo; 4267 char i; 4268 on_error_t *oef = NULL; 4269 4270 if ((toogo = strdup(str)) == NULL) { 4271 (void) fprintf(stderr, "strdup(%s) failed: %s\n", 4272 str, strerror(errno)); 4273 return (NULL); 4274 } 4275 4276 for (i = 0, tmp = toogo; ; i++) { 4277 on_error_t *noef; 4278 int n; 4279 4280 if ((tmp = strtok_r(tmp, ",", &opaque)) == NULL) { 4281 break; 4282 } 4283 noef = realloc(oef, (i+2) * sizeof (on_error_t)); 4284 if (noef == NULL) { 4285 free(toogo); 4286 free(oef); 4287 return (NULL); 4288 } 4289 oef = noef; 4290 oef[i+1] = NULL; 4291 if ((n = select_error_func(tmp, on_error_table, 4292 ARRAY_LEN(on_error_table), &oef[i], rw)) != 0) { 4293 if (n == -1) { 4294 (void) fprintf(stderr, 4295 "bad on error option %s in %s\n", 4296 NIL(tmp), str); 4297 } 4298 break; 4299 } 4300 tmp = NULL; 4301 } 4302 free(toogo); 4303 if (oef == NULL) { 4304 if ((oef = malloc(sizeof (on_error_t))) != NULL) { 4305 oef[0] = NULL; 4306 } 4307 } 4308 return (oef); 4309 } 4310 4311 int 4312 how_many_devices(struct device *devices) 4313 { 4314 int i = 0; 4315 4316 while (devices != NULL) { 4317 i++; 4318 devices = devices->next; 4319 } 4320 return (i); 4321 } 4322 4323 ullong_t 4324 set_file_size(const char *dir) 
4325 { 4326 struct statvfs buf; 4327 if (opts.expert_amount_to_leave_unused && opts.number_of_files && 4328 statvfs(dir, &buf) != -1) { 4329 ullong_t count; 4330 count = (buf.f_bavail * buf.f_frsize) - 4331 opts.expert_amount_to_leave_unused; 4332 count = (count/opts.number_of_files); 4333 4334 return (count); 4335 } 4336 return (opts.file_size); 4337 } 4338 4339 int 4340 set_number_of_files(const char *dir) 4341 { 4342 struct statvfs buf; 4343 if (opts.expert_amount_to_leave_unused && opts.file_size && 4344 statvfs(dir, &buf) != -1) { 4345 ullong_t count; 4346 longlong_t n; 4347 int i; 4348 count = (ullong_t)buf.f_bavail * (ullong_t)buf.f_frsize; 4349 count -= opts.expert_amount_to_leave_unused; 4350 n = count / opts.file_size; 4351 i = (int)(MIN(n, INT_MAX)); 4352 if (opts.number_of_files) 4353 return (MIN(i, opts.number_of_files)); 4354 else 4355 return (i); 4356 } 4357 return (opts.number_of_files); 4358 } 4359 /* 4360 * read the path as if it is a symbolic link and process that. 4361 */ 4362 static char * 4363 do_link(char *path) 4364 { 4365 char buf[PATH_MAX+1]; 4366 char *res; 4367 int x; 4368 4369 if ((x = readlink(path, &buf[0], sizeof (buf))) > 0) { 4370 buf[x] = NULL; 4371 res = full_path(path, &buf[0]); 4372 } else { 4373 res = my_strdup(path); 4374 } 4375 4376 if (res == NULL) { 4377 exit(EXIT_FAILURE); 4378 } 4379 return (res); 4380 } 4381 4382 static struct fds * 4383 open_path_count(struct device *devp, char *name, ullong_t size) 4384 { 4385 struct fds *fd; 4386 int i = 0; 4387 4388 do { 4389 if ((fd = open_path(devp, name, size)) != NULL) 4390 break; 4391 } while (i++ < opts.open_retries); 4392 4393 return (fd); 4394 } 4395 struct device * 4396 open_path_group(struct paths *paths, int paths_to_use, int error_paths) 4397 { 4398 int count; 4399 struct other_paths *opath; 4400 struct device *devp; 4401 struct fds *fd; 4402 int total_paths = paths_to_use + error_paths; 4403 4404 if ((devp = (struct device *)my_calloc(1, 4405 sizeof (struct device))) == 
NULL) { 4406 return (NULL); 4407 } 4408 4409 for (count = 0, opath = paths->op; 4410 count < total_paths && opath != NULL; /* */) { 4411 if ((fd = open_path_count(devp, 4412 opath->path, 0)) != NULL) { 4413 fd->error_path = 4414 count >= paths_to_use ? 1 : 0; 4415 fd->path_id = count++; 4416 } 4417 opath = opath->next; 4418 } 4419 return (devp); 4420 } 4421 /* 4422 * Open_devices 4423 * 4424 * ARGUMENT: char *name 4425 * A space seperated list of devices. Devices may be grouped by 4426 * putting curly brackets around them to sepficy multiple paths to 4427 * the same device. 4428 */ 4429 struct device * 4430 open_devices(char *name) 4431 { 4432 struct device *devp; 4433 struct device *newone; 4434 int brace_count = 0; 4435 int error_paths = opts.error_paths; 4436 int paths_to_use = opts.paths_to_use; 4437 struct paths *path_group = NULL; 4438 struct other_paths *op; 4439 char *tmp; 4440 char *toogo; 4441 char *opaque; 4442 4443 if ((toogo = strdup(name)) == NULL) { 4444 (void) fprintf(stderr, "strdup(%s) failed: %s\n", name, 4445 strerror(errno)); 4446 exit(1); 4447 } 4448 tmp = toogo; 4449 devp = NULL; 4450 4451 while ((tmp = strtok_r(tmp, "\t ", &opaque)) != NULL) { 4452 struct stat64 sbuf; 4453 if (usr1_exit) 4454 exit(0); 4455 4456 if (*tmp == OPEN_BRACE) { 4457 if (brace_count++ == 0) { 4458 error_paths = paths_to_use = 0; 4459 } 4460 if (path_group == NULL) { 4461 path_group = my_calloc( 4462 sizeof (struct paths), 1); 4463 if (path_group == NULL) { 4464 exit(EXIT_FAILURE); 4465 } 4466 } 4467 } else if (*tmp == CLOSE_BRACE) { 4468 if (--brace_count == 0) { 4469 if (path_group->logicalpath == NULL) { 4470 plog(LOG_WARNING, gettext( 4471 "Empty path device list " 4472 "found")); 4473 free(path_group); 4474 path_group = NULL; 4475 continue; 4476 } 4477 newone = open_device(NULL, 4478 path_group, opts.file_size, 4479 paths_to_use, error_paths); 4480 if (newone != NULL) { 4481 newone->next = devp; 4482 devp = newone; 4483 } 4484 free_paths(path_group); 4485 
path_group = NULL; 4486 error_paths = opts.error_paths; 4487 paths_to_use = opts.paths_to_use; 4488 } 4489 if (brace_count < 0) 4490 plog(LOG_WARNING, 4491 "Unbalanced braces in device list\n"); 4492 } else if (*tmp == '-') { 4493 /* PATH options */ 4494 plog(LOG_WARNING, "Path options are not currently " 4495 "supported: \"%s\" ignored\n", tmp); 4496 } else if (path_group != NULL) { 4497 if (path_group->op == NULL) { 4498 op = my_calloc( 4499 sizeof (struct other_paths), 1); 4500 if (op == NULL) { 4501 exit(EXIT_FAILURE); 4502 } 4503 path_group->op = op; 4504 path_group->logicalpath = my_strdup(tmp); 4505 } else { 4506 /* lint does not like empty loops */ 4507 for (op = path_group->op; op->next != NULL; ) { 4508 op = op->next; 4509 } 4510 op->next = my_calloc( 4511 sizeof (struct other_paths), 1); 4512 if (op->next == NULL) { 4513 exit(EXIT_FAILURE); 4514 } 4515 op = op->next; 4516 } 4517 if (brace_count > 1) { 4518 if (path_stop_check == do_path_stop_check) { 4519 error_paths++; 4520 } 4521 } else { 4522 paths_to_use++; 4523 } 4524 op->path = do_link(tmp); 4525 } else if ((opts.number_of_files || 4526 opts.expert_amount_to_leave_unused) && 4527 daio->stat(tmp, &sbuf) != -1 && S_ISDIR(sbuf.st_mode)) { 4528 int len = strlen(tmp) + 4529 strlen(opts.obscure_data_file_basename) + 16; 4530 int i; 4531 char *x; 4532 int nf = set_number_of_files(tmp); 4533 ullong_t size = set_file_size(tmp); 4534 pprintf("%s %d files of %lld bytes\n", tmp, nf, 4535 (ullong_t)size); 4536 for (i = 0; i < nf; i++) { 4537 /* 4538 * If the open succeds then we just have to 4539 * "leak" this memory here as it is in use 4540 * in the device structures. 
4541 */ 4542 if ((x = malloc(len)) == NULL) { 4543 MALLOC_ERROR(len); 4544 exit(1); 4545 } 4546 (void) sprintf(x, "%s/%s%d", tmp, 4547 opts.obscure_data_file_basename, i); 4548 4549 newone = open_device(x, NULL, size, 4550 paths_to_use, error_paths); 4551 4552 if (newone != NULL) { 4553 newone->next = devp; 4554 devp = newone; 4555 } else { 4556 free(x); 4557 } 4558 } 4559 } else { 4560 newone = open_device(tmp, NULL, opts.file_size, 4561 paths_to_use, error_paths); 4562 4563 if (newone != NULL) { 4564 newone->next = devp; 4565 devp = newone; 4566 } 4567 } 4568 tmp = NULL; 4569 } 4570 if (brace_count != 0) { 4571 plog(LOG_WARNING, "Unbalanced braces in device list\n"); 4572 } 4573 /* don't free toogo as it is being used in the devices structures. */ 4574 /* free(toogo); */ 4575 if (usr1_exit) { 4576 exit(0); 4577 } 4578 if (devp != NULL) { 4579 init_device_control(devp); 4580 } 4581 return (devp); 4582 } 4583 4584 void 4585 print_dev(struct device *dev) 4586 { 4587 struct fds *fds; 4588 static const char device_str[] = "device"; 4589 4590 (void) printf("Logical Device: %s\n", dev->logicalname); 4591 4592 USAGE_TRACKING_OPEN_KEY(device_str, NULL, dev->logicalname); 4593 4594 fds = dev->fdhead; 4595 (void) printf("Physical device%s:\n", fds->next == fds ? "" : "s"); 4596 4597 USAGE_TRACKING_OPEN_KEY("paths", NULL, NULL); 4598 4599 for (;;) { 4600 USAGE_TRACKING_STORE_KEY_VALUE("longname", fds->longname); 4601 USAGE_TRACKING_STORE_KEY_VALUE("created", TRUE_OR_FALSE( 4602 fds->created)); 4603 (void) printf("\t%s%s%s%s%s%s\n", fds->longname, 4604 fds->error_path || fds->created ? " (" : "", 4605 fds->error_path ? "error path" : "", 4606 fds->error_path && fds->created ? ", " : "", 4607 fds->created ? "created" : "", 4608 fds->error_path || fds->created ? 
")" : ""); 4609 if (fds->longname != fds->shortname) { 4610 (void) printf("\t\t(%s)\n", fds->shortname); 4611 } 4612 if (fds->next != dev->fdhead) { 4613 fds = fds->next; 4614 } else { 4615 break; 4616 } 4617 } 4618 4619 USAGE_TRACKING_CLOSE_KEY(); 4620 4621 print_number_of_bytes(dev->device_block_size, 4622 "Device block size", "Device block size"); 4623 print_number_of_bytes(dev->length, "length", "length"); 4624 print_number(LEN_BYTES2BLOCKS(dev), "block", "blocks"); 4625 (void) fflush(stdout); 4626 if (write_loops) { 4627 print_number(dev->countdown, "write", "writes"); 4628 } else if (opts.nloops) { 4629 print_number(dev->countdown, "read", "reads"); 4630 } 4631 4632 USAGE_TRACKING_STORE_KEY_VALUE_INT("length", dev->length); 4633 4634 USAGE_TRACKING_STORE_KEY_VALUE_INT("blocks", LEN_BYTES2BLOCKS(dev)); 4635 4636 USAGE_TRACKING_CLOSE_KEY(); 4637 } 4638 /* 4639 * close all the fds and free the data associated with all the paths 4640 * for a device. 4641 */ 4642 void 4643 close_and_free_paths(struct device *dev) 4644 { 4645 struct fds *fd, *next; 4646 4647 for (fd = dev->fdhead, next = fd->next; ; fd = next, next = fd->next) { 4648 (void) daio->close(fd->fd); 4649 if (opts.expert_cleanup_created_files && fd->created && 4650 is_master() && exit_status == EXIT_SUCCESS && 4651 get_shared_device_error(dev->shared_data_handle) == 0) { 4652 pprintf(gettext("Removing %s\n"), fd->longname); 4653 if (daio->unlink(fd->longname) == -1) { 4654 pperror(gettext("unlink(%s)"), fd->longname); 4655 } 4656 } 4657 if (fd->shortname != fd->longname) { 4658 free(fd->longname); 4659 } 4660 free(fd->shortname); 4661 free(fd); 4662 if (dev->fdhead == next) { 4663 break; 4664 } 4665 } 4666 dev->fdhead = NULL; 4667 } 4668 struct fds * 4669 open_path(struct device *devp, char *name, ullong_t size) 4670 { 4671 struct fds *fd; 4672 struct dk_cinfo dk_cinfo; 4673 struct stat64 sbuf; 4674 char create; 4675 4676 check_exit_flag(); 4677 4678 if (daio->stat(name, &sbuf) == -1) { 4679 if (size == 
0) { 4680 pfprintf(stderr, "stat(%s) == -1 errno = %d (%s)\n", 4681 name, errno, strerror(errno)); 4682 return (NULL); 4683 } else { 4684 create = 1; 4685 } 4686 } else { 4687 create = 0; 4688 } 4689 if ((fd = (struct fds *)calloc(1, sizeof (struct fds))) == NULL) { 4690 return (NULL); 4691 } 4692 4693 if (opts.debug_no_action == 0) { 4694 if ((fd->fd = daio->open(name, 4695 (is_readonly() ? O_RDONLY : O_RDWR)| 4696 (opts.o_sync ? O_SYNC : 0)| 4697 (opts.o_excl ? O_EXCL : 0) | 4698 (opts.o_ndelay ? O_NDELAY : 0) | 4699 (create ? O_CREAT : 0) | 4700 (opts.o_trunc ? O_TRUNC : 0), 0600)) == -1) { 4701 pperror("open(%s, %s%s%s%s%s%s)", 4702 name, (is_readonly() ? "O_RDONLY" : "O_RDWR"), 4703 (opts.o_excl ? "|O_EXCL": ""), 4704 (opts.o_sync ? "|O_SYNC": ""), 4705 (opts.o_ndelay ? "|O_NDELAY": ""), 4706 (opts.o_trunc ? "|O_TRUNC": ""), 4707 (create ? "|O_CREAT, 0600" : "")); 4708 free(fd); 4709 return (NULL); 4710 } 4711 if (daio->directio(fd->fd, opts.directio == 1 ? 4712 DIRECTIO_ON : DIRECTIO_OFF) == -1) { 4713 if (errno != ENOTTY || opts.directio == 1) { 4714 pperror("directio(\"%s\") failed", name); 4715 } 4716 } 4717 fd->created = create; 4718 if (create) { 4719 if (daio->fstat(fd->fd, &sbuf) == -1) { 4720 FSTAT_ERROR(fd->fd, name); 4721 (void) daio->close(fd->fd); 4722 free(fd); 4723 return (NULL); 4724 } 4725 if (S_ISREG(sbuf.st_mode) && 4726 daio->ftruncate(fd->fd, size) == -1) { 4727 (void) daio->close(fd->fd); 4728 free(fd); 4729 return (NULL); 4730 } 4731 } 4732 } 4733 if (sbuf.st_mode & (S_IFCHR|S_IFBLK)) { 4734 fd->devid.dev = sbuf.st_rdev; 4735 } else { 4736 fd->devid.dev = sbuf.st_dev; 4737 } 4738 fd->devid.ino = sbuf.st_ino; 4739 fd->read_times.str = read_str; 4740 fd->read_times.best = 0xffffffff; 4741 fd->write_times.str = write_str; 4742 fd->write_times.best = 0xffffffff; 4743 fd->last_read_time = fd->last_write_time = ~0; 4744 if ((fd->longname = strdup(name)) == NULL) { 4745 (void) daio->close(fd->fd); 4746 free(fd); 4747 return (NULL); 4748 } 4749 
fd->stop_flag = 0; 4750 fd->shared_data_handle = init_shared_device_info(opts.nprocs); 4751 if (fd->shared_data_handle == NULL) { 4752 plog(LOG_ERR, gettext("Unable to allocate shared data " 4753 "handle for %s\n"), name); 4754 } 4755 4756 if (daio->ioctl(fd->fd, DKIOCINFO, &dk_cinfo) == -1) { 4757 fd->shortname = fd->longname; 4758 } else { 4759 fd->shortname = calloc(1, 4760 strlen(dk_cinfo.dki_dname) + (3 * 10)); 4761 if (fd->shortname == NULL) { 4762 fd->shortname = fd->longname; 4763 } else { 4764 (void) sprintf(fd->shortname, "%s%d:%c", 4765 dk_cinfo.dki_dname, dk_cinfo.dki_unit, 4766 dk_cinfo.dki_partition + 'a'); 4767 } 4768 } 4769 if (opts.use_long_names) { 4770 fd->name = fd->longname; 4771 } else { 4772 fd->name = fd->shortname; 4773 } 4774 Longest_device_name = MAX(Longest_device_name, strlen(fd->name)); 4775 if (devp->fdhead == NULL) { 4776 devp->fdhead = fd; 4777 fd->next = fd; 4778 } else { 4779 fd->next = devp->fdhead->next; 4780 devp->fdhead->next = fd; 4781 } 4782 return (fd); 4783 } 4784 4785 void * 4786 read_vtoc_all_paths(struct fds *fdhead) 4787 { 4788 struct fds *fd; 4789 void *handle = NULL; 4790 4791 fd = fdhead; 4792 do { 4793 if ((handle = daio->read_vtoc(fd->fd)) != NULL) { 4794 break; 4795 } 4796 fd = fd->next; 4797 4798 } while (fd->next != fdhead); /* do loop! */ 4799 4800 return (handle); 4801 } 4802 struct paths * 4803 do_ap(const char *inpath) 4804 { 4805 return (daio->findap(inpath, opts.dev_tree)); 4806 } 4807 4808 /* 4809 * Set the minimum possible block size that can be used for all the devices 4810 * in this test set. Typically the block sizes seen are 512 bytes, 2048 bytes 4811 * or 4096 bytes. It will choose the smallest common multiple of the block 4812 * sizes available. Typically this will just be the largest block size of 4813 * all the devices but if you had a 3K and 4K block sized device this will 4814 * return the smallest block size possible is 12k. 
4815 */ 4816 static void 4817 set_minimum_block_size(int block_size) 4818 { 4819 if (min_block_size == 0) { 4820 min_block_size = block_size; 4821 } else { 4822 min_block_size = min_block_size * block_size / 4823 gcd(min_block_size, block_size); 4824 } 4825 } 4826 4827 struct device * 4828 open_device(char *name, struct paths *paths, ullong_t size, 4829 int paths_to_use, int error_paths) 4830 { 4831 struct stat64 sbuf; 4832 ullong_t nsize; 4833 void *vtoc_handle; 4834 struct device *devp; 4835 struct fds *fd; 4836 int total_paths = paths_to_use + error_paths; 4837 4838 4839 if (paths != NULL) { 4840 name = my_strdup(paths->logicalpath); 4841 if (name == NULL) { 4842 return (NULL); 4843 } 4844 devp = open_path_group(paths, paths_to_use, error_paths); 4845 } else if (total_paths > 1 && 4846 (paths = do_ap(name)) != NULL) { 4847 devp = open_path_group(paths, paths_to_use, error_paths); 4848 free_paths(paths); 4849 } else { 4850 if ((devp = (struct device *)calloc(1, 4851 sizeof (struct device))) == NULL) { 4852 return (NULL); 4853 } 4854 if ((fd = open_path_count(devp, name, size)) != NULL) { 4855 fd->path_id = 0; 4856 fd->error_path = 0; 4857 } 4858 } 4859 Longest_logical_name = MAX(Longest_logical_name, strlen(name)); 4860 4861 if (devp->fdhead == NULL) { 4862 free(devp); 4863 return (NULL); 4864 } else { 4865 devp->logicalname = name; 4866 } 4867 4868 if (opts.debug_no_action) { 4869 return (devp); 4870 } 4871 /* 4872 * this is a mess. 
4873 */ 4874 4875 if (daio->fstat(devp->fdhead->fd, &sbuf) == -1) { 4876 FSTAT_ERROR(devp->fdhead->fd, devp->fdhead->name); 4877 close_and_free_paths(devp); 4878 free(devp); 4879 return (NULL); 4880 } 4881 devp->next = NULL; 4882 devp->choose_block = seq_block; 4883 4884 if (!(sbuf.st_mode & S_IFCHR)) { 4885 plog(LOG_DEBUG, "Not a character device\n"); 4886 nsize = (ullong_t)sbuf.st_size; 4887 devp->device_block_size = SIZEOF_BUF; 4888 } else if ((vtoc_handle = read_vtoc_all_paths(devp->fdhead)) == NULL) { 4889 nsize = (ullong_t)SIZEOF_BUF*(ullong_t)sbuf.st_blocks; 4890 devp->device_block_size = SIZEOF_BUF; 4891 } else { 4892 const struct disko_partition *part; 4893 4894 devp->device_block_size = disko_vtoc_sectorsz(vtoc_handle); 4895 4896 if (devp->device_block_size == 0) { 4897 devp->device_block_size = DEFAULT_BLOCK_SIZE; 4898 } 4899 4900 part = disko_vtoc_this_partition(vtoc_handle); 4901 4902 devp->v_part = malloc(sizeof (struct disko_partition)); 4903 4904 if (devp->v_part != NULL) { 4905 *devp->v_part = *part; 4906 } 4907 4908 nsize = (ullong_t)devp->device_block_size * part->p_size; 4909 4910 disko_vtoc_free(vtoc_handle); 4911 } 4912 set_minimum_block_size(devp->device_block_size); 4913 4914 if (size == 0 || (nsize > 0 && nsize < size)) { 4915 size = nsize; 4916 } 4917 4918 if (size == 0) { 4919 (void) fprintf(stderr, gettext("File size is zero on %s\n"), 4920 name); 4921 (void) fflush(stderr); 4922 close_and_free_paths(devp); 4923 free(devp); 4924 return (NULL); 4925 } 4926 devp->length = size-(opts.start_offset * 4927 INDEX_TO_DIOLEN(max_disk_io_len)); 4928 devp->read_start_block = opts.expert_recent_log_size + 4929 ((devp->length/INDEX_TO_DIOLEN(max_disk_io_len)) * 4930 opts.start_reads_percentage)/100; 4931 4932 if (devp->read_start_block < (opts.nprocs * (opts.rthreads + 4933 opts.wthreads))) { 4934 devp->read_start_block = (opts.nprocs * (opts.rthreads + 4935 opts.wthreads)); 4936 } 4937 if (devp->read_start_block > LEN_BYTES2BLOCKS(devp)) { 4938 
devp->read_start_block = LEN_BYTES2BLOCKS(devp); 4939 } 4940 4941 if (opts.nloops) { 4942 devp->countdown = (opts.nloops * devp->length) / 4943 (opts.nprocs * INDEX_TO_DIOLEN(max_disk_io_len)); 4944 } else { 4945 devp->countdown = ~(uint64_t)0; 4946 } 4947 4948 print_dev(devp); 4949 print_number_of_bytes(size, "size", "size"); 4950 if (opts.start_offset * INDEX_TO_DIOLEN(max_disk_io_len) > size) { 4951 (void) printf("starting offset is greater than disk size! "); 4952 (void) printf("%llx > %llx\n", (ullong_t)(opts.start_offset * 4953 INDEX_TO_DIOLEN(max_disk_io_len)), (ullong_t)size); 4954 close_and_free_paths(devp); 4955 free(devp); 4956 return (NULL); 4957 } 4958 if (LEN_BYTES2BLOCKS(devp) < (opts.nprocs * 4959 (opts.wthreads + opts.rthreads + opts.wrthreads))) { 4960 4961 (void) printf("There are not enough blocks (%#llx) to support " 4962 "this many I/O's (%#lx) on device %s, device closed\n", 4963 (ullong_t)LEN_BYTES2BLOCKS(devp), 4964 (ulong_t)(opts.nprocs * (opts.wthreads + 4965 opts.rthreads + opts.wrthreads)), name); 4966 close_and_free_paths(devp); 4967 free(devp); 4968 return (NULL); 4969 } 4970 if (opts.expert_max_active_time == 0) { 4971 devp->state_ttl.tv_sec = 0; 4972 devp->state_ttl.tv_usec = 0; 4973 } else { 4974 while (my_gettimeofday(&devp->state_ttl, NULL) == -1) 4975 pperror("gettimeofday"); 4976 devp->state_ttl = set_ttl(devp->state_ttl, 4977 opts.expert_max_active_time, 4978 opts.expert_min_active_time); 4979 } 4980 devp->shared_data_handle = init_shared_device_info(opts.nprocs); 4981 if (devp->shared_data_handle == NULL) { 4982 plog(LOG_ERR, gettext("Unable to allocate shared data " 4983 "handle for %s\n"), name); 4984 } 4985 devp->seq_passes = opts.sequential_passes; 4986 devp->recent = init_recent(opts.expert_recent_log_size); 4987 return (devp); 4988 } 4989 4990 static int 4991 check_for_duplicate_paths(struct device *devp) 4992 { 4993 uchar_t *buf; 4994 int buflen = min_block_size; 4995 struct device *d; 4996 struct fds *fd; 4997 int 
status = 1; 4998 4999 if ((buf = malloc(buflen)) == NULL) { 5000 return (0); 5001 } 5002 memset(buf, NULL, buflen); 5003 5004 /* 5005 * first zero all the target blocks 5006 */ 5007 for (d = devp; d != NULL; d = d->next) { 5008 fd = d->fdhead; 5009 do { 5010 check_exit_flag(); 5011 if (daio->pwrite(fd->fd, buf, buflen, 5012 INDEX_TO_DIOLEN(max_disk_io_len) * 5013 OPTION(start_offset), NULL) != buflen) { 5014 PWRITE_ERROR(fd->fd, fd->name, 5015 (ulong_t)buf, 5016 buflen, 5017 INDEX_TO_DIOLEN( 5018 max_disk_io_len) * 5019 OPTION(start_offset)); 5020 status = 0; 5021 } 5022 5023 fd = fd->next; 5024 } while (fd != d->fdhead); 5025 } 5026 /* Now write the dev structure to the first path only */ 5027 for (d = devp; d != NULL; d = d->next) { 5028 check_exit_flag(); 5029 fd = d->fdhead; 5030 (void) memcpy(&buf[0], d, sizeof (struct device)); 5031 if (daio->pwrite(fd->fd, buf, buflen, 5032 INDEX_TO_DIOLEN(max_disk_io_len) * 5033 OPTION(start_offset), NULL) != buflen) { 5034 PWRITE_ERROR(fd->fd, fd->name, (ulong_t)buf, 5035 buflen, INDEX_TO_DIOLEN(max_disk_io_len) * 5036 OPTION(start_offset)); 5037 status = 0; 5038 } 5039 } 5040 /* 5041 * Now read all the blocks via each path and verify that they 5042 * are ok. 
5043 */ 5044 for (d = devp; d != NULL; d = d->next) { 5045 fd = d->fdhead; 5046 do { 5047 check_exit_flag(); 5048 memset(buf, NULL, buflen); 5049 if (daio->pread(fd->fd, buf, buflen, 5050 INDEX_TO_DIOLEN(max_disk_io_len) * 5051 OPTION(start_offset), NULL) != buflen) { 5052 PREAD_ERROR(fd->fd, fd->name, 5053 (ulong_t)buf, buflen, 5054 INDEX_TO_DIOLEN(max_disk_io_len) * 5055 OPTION(start_offset)); 5056 status = 0; 5057 } else if (memcmp(buf, d, sizeof (struct device)) != 5058 0) { 5059 status = 0; 5060 pfprintf(stderr, 5061 "dev %s path %s failed path check\n", 5062 d->logicalname, fd->name); 5063 } 5064 fd = fd->next; 5065 } while (fd != d->fdhead); 5066 } 5067 free(buf); 5068 return (status); 5069 } 5070 5071 void 5072 print_uname(FILE *out) 5073 { 5074 static char uname_str[] = "uname"; 5075 5076 struct utsname name; 5077 char platform[255], hw_prov[255], domain[255]; 5078 (void) sysinfo(SI_PLATFORM, &platform[0], sizeof (platform)); 5079 (void) sysinfo(SI_HW_PROVIDER, &hw_prov[0], sizeof (hw_prov)); 5080 (void) sysinfo(SI_SRPC_DOMAIN, &domain[0], sizeof (domain)); 5081 if (uname(&name) == -1) { 5082 pperror(uname_str); 5083 } 5084 (void) fprintf(out, "System info:\n\t%s %s %s %s %s %s %s\n", 5085 name.sysname, name.nodename, 5086 name.release, name.version, name.machine, platform, 5087 hw_prov); 5088 } 5089 5090 void 5091 set_max_blocks(void) 5092 { 5093 int i; 5094 5095 for (i = 0; i < opts.disk_io_sizes.wlen; i++) { 5096 if (opts.disk_io_sizes.vals[max_disk_io_len] < 5097 opts.disk_io_sizes.vals[opts.disk_io_sizes.weightings[i]]) { 5098 max_disk_io_len = opts.disk_io_sizes.weightings[i]; 5099 } 5100 } 5101 } 5102 5103 int 5104 check_block_sizes(void) 5105 { 5106 int i; 5107 int bs; 5108 int ret = 0; 5109 5110 for (i = 0; i < opts.disk_io_sizes.wlen; i++) { 5111 bs = opts.disk_io_sizes.vals[opts.disk_io_sizes.weightings[i]]; 5112 if ((bs % min_block_size) != 0) { 5113 plog(LOG_ERR, "Disk IO size 0x%x (%d) is not a " 5114 "multiple of the minimum block size, 0x%x 
(%d)\n", 5115 bs, bs, min_block_size, min_block_size); 5116 ret = 1; 5117 } 5118 } 5119 return (ret); 5120 } 5121 5122 static void 5123 usr1(int sig, siginfo_t *info, void *v) 5124 { 5125 plog(LOG_DEBUG, "USR1 caught\n"); 5126 usr1_exit++; 5127 } 5128 static int exit_flag; 5129 /*ARGSUSED*/ 5130 static void 5131 set_exit_flag(int sig, siginfo_t *info, void *v) 5132 { 5133 plog(LOG_DEBUG, "Sig %d\n", sig); 5134 if (info == NULL) { 5135 /* 5136 * Keyboard generated SIGINT has no info pointer. 5137 */ 5138 if (sig == SIGINT) 5139 killer_pid = master_pid(); 5140 } else if (killer_pid == 0) 5141 killer_pid = info->si_pid; 5142 exit_flag++; 5143 } 5144 void 5145 check_exit_flag() 5146 { 5147 if (exit_flag) { 5148 (void) sighold(SIGTERM); 5149 exit(killer_pid == master_pid() ? exit_status : EXIT_FAILURE); 5150 } 5151 } 5152 static void 5153 print_startup_info(void) 5154 { 5155 (void) printf("Setting up to do:\n"); 5156 print_number(LONG_BIT, "Bit mode", "Bit mode"); 5157 print_number_of_bytes(min_block_size, 5158 "Common block size", "Common block size"); 5159 (void) printf("\tRead %s mode\n", is_readonly() ? "only" : write_str); 5160 random_str = is_readonly() ? 
"random " : ""; 5161 print_number(opts.wthreads, "write", "writes"); 5162 print_number(opts.wrthreads, "Write - read", "Write - reads"); 5163 print_number(opts.rthreads, "read", "reads"); 5164 print_number_of_bytes(INDEX_TO_DIOLEN(max_disk_io_len), 5165 "Max block size", "Max block size"); 5166 print_number(opts.nprocs, "proc", "procs"); 5167 print_number(opts.nlocks, "lock", "locks"); 5168 (void) printf("\t%d%% of disk written before reads start\n", 5169 opts.start_reads_percentage); 5170 if (!is_readonly() && opts.obscure_execute && 5171 does_check(daio->what_checker())) { 5172 (void) printf("\tWill execute code read into buffer\n"); 5173 } 5174 (void) printf("\tUsing %s as buffer allocator\n", 5175 shm_ops->longname(NULL)); 5176 5177 USAGE_TRACKING_STORE_KEY_VALUE("allocator", shm_ops->longname(NULL)); 5178 #define UT_KVS(A) USAGE_TRACKING_STORE_KEY_VALUE_INT(#A, opts.A); 5179 UT_KVS(nprocs); 5180 UT_KVS(wthreads); 5181 UT_KVS(wrthreads); 5182 UT_KVS(rthreads); 5183 #undef UT_KVS 5184 5185 } 5186 5187 static void 5188 setup_signals(void) 5189 { 5190 setup_signal_catcher(SIGTERM, set_exit_flag, SA_SIGINFO); 5191 setup_signal_catcher(SIGHUP, set_exit_flag, SA_SIGINFO); 5192 setup_signal_catcher(SIGINT, set_exit_flag, SA_SIGINFO); 5193 setup_signal_catcher(SIGUSR1, usr1, 0); 5194 } 5195 5196 int 5197 main(int argc, char **argv) 5198 { 5199 const char *path; 5200 srand48(getpid()); 5201 5202 path = set_diskomizer_path(); 5203 5204 if (do_args(argc, argv, pprintf, path) == 0) { 5205 usage(*argv); 5206 } 5207 /* 5208 * Usage tracking has to open after argument checking as we need 5209 * the values from the configuration files. 
5210 */ 5211 usage_tracking_handle = open_usage_tracking( 5212 opts.obscure_usagetracking_domain, 5213 opts.obscure_sendmail, 5214 opts.obscure_usage_email, /* from */ 5215 "diskomizer", /* to */ 5216 diskomizer_str, /* tool */ 5217 VERSION); 5218 5219 setup_signal((int (*)(void *, const char *, ...))pfprintf, stderr); 5220 set_limits(); 5221 5222 set_max_blocks(); 5223 5224 if (opts.STDOUT != NULL) 5225 if (freopen(opts.STDOUT, "a+", stdout) == NULL) { 5226 pperror("Unable to open %s for stdout\n", 5227 opts.STDOUT); 5228 exit(1); 5229 } 5230 if (opts.STDERR != NULL) 5231 if (freopen(opts.STDERR, "a+", stderr) == NULL) { 5232 pperror("Unable to open %s for stderr\n", 5233 opts.STDERR); 5234 exit(1); 5235 } 5236 popenlog("diskomizer"); 5237 5238 5239 if (my_gettimeofday(&start_time, NULL) == -1) { 5240 plog(LOG_ERR, "Unable to get time of day\n"); 5241 exit(EXIT_FAILURE); 5242 } 5243 5244 set_serial_and_provider(); 5245 5246 if (opts.expert_write_cluster_length == 0) 5247 opts.expert_write_cluster_length = 1; 5248 if (opts.expert_read_cluster_length == 0) 5249 opts.expert_read_cluster_length = 1; 5250 if (opts.STDERR != NULL && opts.STDOUT != NULL && opts.background) 5251 background(); 5252 (void) printf("\tCopyright %s Sun Microsystems, Inc." 5253 " All Rights Reserved\n\tUse is subject to license terms.\n\t" 5254 "Version %s\n", THIS_YEAR, VERSION); 5255 5256 print_args(argc, argv, (void (*)(const char *, ...))printf); 5257 /* Check for values which mean we do nothing */ 5258 if (opts.nprocs < 1 || opts.start_reads_percentage > 100) { 5259 exit(1); 5260 } 5261 5262 if (opts.read_minimum > 0 && 5263 opts.rthreads < opts.wthreads * opts.read_minimum) { 5264 (void) printf("WARNING: The ratio of readers to writers with " 5265 "read_minimum set to %d\ncould lead to thrashing " 5266 "or deadlock\n", opts.read_minimum); 5267 } 5268 5269 /* 5270 * Set up all the functions to use. 5271 */ 5272 /* First what to do on error. 
*/ 5273 if ((on_error_short = setup_onerror(*argv, opts.on_error_short, 5274 READ_ERR)) == NULL) { 5275 exit_status = EXIT_FAILURE; 5276 exit(exit_status); 5277 } 5278 5279 if ((on_error_corrupt = setup_onerror(*argv, opts.on_error_corrupt, 5280 READ_ERR)) == NULL) { 5281 exit_status = EXIT_FAILURE; 5282 exit(exit_status); 5283 } 5284 5285 if ((on_write_error = setup_onerror(*argv, opts.on_write_error, 5286 WRITE_ERR)) == NULL) { 5287 exit_status = EXIT_FAILURE; 5288 exit(exit_status); 5289 } 5290 5291 /* Now the type of allocator to be used */ 5292 5293 if ((init_uchar_func = setup_write_buf_initializer()) == NULL || 5294 (read_buffer_initializer = setup_read_buf_initializer()) == NULL) { 5295 usage(*argv); 5296 } 5297 5298 /* Choose a shared memmory allocator */ 5299 shm_ops = choose_shm_ops(opts.allocator); 5300 if (opts.device == NULL) { 5301 (void) prompt(); 5302 } 5303 if (opts.device == NULL) { 5304 pfprintf(stderr, "No devices specified.\n"); 5305 exit(1); 5306 } 5307 print_uname(stdout); 5308 USAGE_TRACKING_STORE_KEY_VALUE_INT("pid", getpid()); 5309 5310 print_bufhdr_offsets(stdout); 5311 5312 print_serial_and_provider(stdout); 5313 5314 if (opts.nlocks == 0) 5315 opts.nlocks = (opts.nprocs * 2) + 1; /* should be prime */ 5316 pgrp = setpgrp(); 5317 parent_pid = getpid(); 5318 setup_signals(); 5319 /* Now setup the locking primitives to use to protect the bit maps */ 5320 init_locks(); 5321 /* register a clean up routine. */ 5322 (void) atexit(cleanup); 5323 /* get our daio */ 5324 if ((daio = daio_choose_ops(opts.aio_routines)) == NULL) { 5325 char *reason = dlerror(); 5326 (void) pfprintf(stderr, 5327 "Unable to load daio routines(%s): %s\n", 5328 opts.aio_routines, 5329 reason == NULL ? 
"Unknown" : reason); 5330 exit(1); 5331 } 5332 5333 daio->init_master(opts.checker, INDEX_TO_DIOLEN(max_disk_io_len)); 5334 5335 if (opts.nloops && opts.rthreads == 0 && opts.wrthreads == 0) { 5336 write_loops = 1; 5337 } 5338 5339 USAGE_TRACKING_OPEN_KEY("devices", NULL, NULL); 5340 /* now open the devices */ 5341 if ((devices = open_devices(opts.device)) == NULL) { 5342 (void) pfprintf(stderr, "No devices opened\n"); 5343 exit(1); 5344 } 5345 USAGE_TRACKING_CLOSE_KEY(); 5346 findap_fini(); /* free up any data that was cached */ 5347 5348 if (check_block_sizes()) { 5349 exit(1); 5350 } 5351 5352 print_startup_info(); 5353 5354 /* and go! */ 5355 do_aio(devices, start_offset(), opts.report_time); 5356 /*NOTREACHED*/ 5357 return (1); 5358 } 5359 5360 long long 5361 convert_time(struct timeval tv) 5362 { 5363 long long tyme; 5364 long long mill = MILLION; 5365 5366 tyme = (long long)tv.tv_sec; 5367 assert(tyme >= 0); 5368 tyme *= mill; 5369 assert(tyme >= 0); 5370 tyme += tv.tv_usec; 5371 assert(tyme >= 0); 5372 return (tyme); 5373 } 5374 int 5375 longest_logical_name(void) 5376 { 5377 return (Longest_logical_name); 5378 } 5379 int 5380 longest_device_name(void) 5381 { 5382 return (Longest_device_name); 5383 } 5384 void 5385 update_time_stats(char off, struct times *tp, hrtime_t hrtyme, 5386 struct aio_str *aiop) 5387 { 5388 if (hrtyme < 0) { 5389 pfprintf(stderr, "Warning time appears to go backwards\n"); 5390 return; 5391 } 5392 5393 if (hrtyme > tp->worst) { 5394 tp->worst = hrtyme; 5395 } 5396 if (hrtyme < tp->best) { 5397 tp->best = hrtyme; 5398 } 5399 5400 tp->ave -= tp->last_few[tp->count % ARRAY_LEN(tp->last_few)]; 5401 tp->last_few[tp->count++ % ARRAY_LEN(tp->last_few)] = hrtyme; 5402 tp->ave += hrtyme; 5403 5404 if (opts.how_often_to_report && 5405 (tp->count % opts.how_often_to_report) == 0) { 5406 plog(LOG_INFO, "%-*s (%-*s) %s times (%.*f,%.*f,%.*f) %3d%%\n", 5407 longest_logical_name(), aiop->dev->logicalname, 5408 longest_device_name(), aiop->fd->name, 
5409 tp->str, 5410 opts.expert_decimal_places, (double)tp->best/ACCURACY, 5411 opts.expert_decimal_places, 5412 (double)(tp->ave/MIN(tp->count, 5413 ARRAY_LEN(tp->last_few)))/ACCURACY, 5414 opts.expert_decimal_places, (double)tp->worst/ACCURACY, 5415 off); 5416 } 5417 } 5418