Home | History | Annotate | Download | only in aio
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * posix_aio.c implements the POSIX async. I/O functions.
     29  *
     30  *	aio_read
     31  *	aio_write
     32  *	aio_error
     33  *	aio_return
     34  *	aio_suspend
     35  *	lio_listio
     36  *	aio_fsync
     37  *	aio_cancel
     38  */
     39 
     40 #include "lint.h"
     41 #include "thr_uberdata.h"
     42 #include "asyncio.h"
     43 #include <atomic.h>
     44 #include <sys/file.h>
     45 #include <sys/port.h>
     46 
/*
 * Synchronous sync primitive; aio_fsync() below calls it with FDSYNC or
 * FSYNC when no read/write workers exist.
 */
extern int __fdsync(int, int);

/*
 * Condition variable a second __aio_waitn() caller sleeps on (under
 * __aio_mutex) while another call holds AIO_LIB_WAITN; it is signalled
 * from _aio_waitn_cleanup() when AIO_LIB_WAITN_PENDING is set.
 */
cond_t	_aio_waitn_cv = DEFAULTCV;	/* wait for end of aio_waitn */

/*
 * Timeout helper used by __aio_waitn(); a non-zero return is treated as
 * failure there.  Presumably it fills in the end time and one of the
 * AIO_TIMEOUT_* classes -- its definition is outside this part of the file.
 */
static int _aio_check_timeout(const timespec_t *, timespec_t *, int *);
     52 
     53 /* defines for timedwait in __aio_waitn()  and __aio_suspend() */
     54 #define	AIO_TIMEOUT_INDEF	-1
     55 #define	AIO_TIMEOUT_POLL	0
     56 #define	AIO_TIMEOUT_WAIT	1
     57 #define	AIO_TIMEOUT_UNDEF	2
     58 
/*
 * List I/O stuff
 */
/* Drops one entry (lio_nent and lio_refcnt) from a list head; defined below. */
static void _lio_list_decr(aio_lio_t *);
/*
 * Cached sysconf(_SC_AIO_LISTIO_MAX) value; lio_listio() fetches it on
 * the first call, while it is still 0.
 */
static long aio_list_max = 0;
     64 
     65 int
     66 aio_read(aiocb_t *aiocbp)
     67 {
     68 	if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
     69 		errno = EINVAL;
     70 		return (-1);
     71 	}
     72 	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
     73 		errno = EBUSY;
     74 		return (-1);
     75 	}
     76 	if (_aio_sigev_thread(aiocbp) != 0)
     77 		return (-1);
     78 	aiocbp->aio_lio_opcode = LIO_READ;
     79 	return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAREAD,
     80 	    (AIO_KAIO | AIO_NO_DUPS)));
     81 }
     82 
     83 int
     84 aio_write(aiocb_t *aiocbp)
     85 {
     86 	if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
     87 		errno = EINVAL;
     88 		return (-1);
     89 	}
     90 	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
     91 		errno = EBUSY;
     92 		return (-1);
     93 	}
     94 	if (_aio_sigev_thread(aiocbp) != 0)
     95 		return (-1);
     96 	aiocbp->aio_lio_opcode = LIO_WRITE;
     97 	return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAWRITE,
     98 	    (AIO_KAIO | AIO_NO_DUPS)));
     99 }
    100 
    101 /*
    102  * __lio_listio() cancellation handler.
    103  */
    104 /* ARGSUSED */
    105 static void
    106 _lio_listio_cleanup(aio_lio_t *head)
    107 {
    108 	int freeit = 0;
    109 
    110 	ASSERT(MUTEX_HELD(&head->lio_mutex));
    111 	if (head->lio_refcnt == 0) {
    112 		ASSERT(head->lio_nent == 0);
    113 		freeit = 1;
    114 	}
    115 	head->lio_waiting = 0;
    116 	sig_mutex_unlock(&head->lio_mutex);
    117 	if (freeit)
    118 		_aio_lio_free(head);
    119 }
    120 
    121 int
    122 lio_listio(int mode, aiocb_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list,
    123 	int nent, struct sigevent *_RESTRICT_KYWD sigevp)
    124 {
    125 	int 		aio_ufs = 0;
    126 	int 		oerrno = 0;
    127 	aio_lio_t	*head = NULL;
    128 	aiocb_t		*aiocbp;
    129 	int		state = 0;
    130 	int 		EIOflg = 0;
    131 	int 		rw;
    132 	int		do_kaio = 0;
    133 	int 		error;
    134 	int 		i;
    135 
    136 	if (!_kaio_ok)
    137 		_kaio_init();
    138 
    139 	if (aio_list_max == 0)
    140 		aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
    141 
    142 	if (nent <= 0 || nent > aio_list_max) {
    143 		errno = EINVAL;
    144 		return (-1);
    145 	}
    146 
    147 	switch (mode) {
    148 	case LIO_WAIT:
    149 		state = NOCHECK;
    150 		break;
    151 	case LIO_NOWAIT:
    152 		state = CHECK;
    153 		break;
    154 	default:
    155 		errno = EINVAL;
    156 		return (-1);
    157 	}
    158 
    159 	for (i = 0; i < nent; i++) {
    160 		if ((aiocbp = list[i]) == NULL)
    161 			continue;
    162 		if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
    163 			errno = EBUSY;
    164 			return (-1);
    165 		}
    166 		if (_aio_sigev_thread(aiocbp) != 0)
    167 			return (-1);
    168 		if (aiocbp->aio_lio_opcode == LIO_NOP)
    169 			aiocbp->aio_state = NOCHECK;
    170 		else {
    171 			aiocbp->aio_state = state;
    172 			if (KAIO_SUPPORTED(aiocbp->aio_fildes))
    173 				do_kaio++;
    174 			else
    175 				aiocbp->aio_resultp.aio_errno = ENOTSUP;
    176 		}
    177 	}
    178 	if (_aio_sigev_thread_init(sigevp) != 0)
    179 		return (-1);
    180 
    181 	if (do_kaio) {
    182 		error = (int)_kaio(AIOLIO, mode, list, nent, sigevp);
    183 		if (error == 0)
    184 			return (0);
    185 		oerrno = errno;
    186 	} else {
    187 		oerrno = errno = ENOTSUP;
    188 		error = -1;
    189 	}
    190 
    191 	if (error == -1 && errno == ENOTSUP) {
    192 		error = errno = 0;
    193 		/*
    194 		 * If LIO_WAIT, or notification required, allocate a list head.
    195 		 */
    196 		if (mode == LIO_WAIT ||
    197 		    (sigevp != NULL &&
    198 		    (sigevp->sigev_notify == SIGEV_SIGNAL ||
    199 		    sigevp->sigev_notify == SIGEV_THREAD ||
    200 		    sigevp->sigev_notify == SIGEV_PORT)))
    201 			head = _aio_lio_alloc();
    202 		if (head) {
    203 			sig_mutex_lock(&head->lio_mutex);
    204 			head->lio_mode = mode;
    205 			head->lio_largefile = 0;
    206 			if (mode == LIO_NOWAIT && sigevp != NULL) {
    207 				if (sigevp->sigev_notify == SIGEV_THREAD) {
    208 					head->lio_port = sigevp->sigev_signo;
    209 					head->lio_event = AIOLIO;
    210 					head->lio_sigevent = sigevp;
    211 					head->lio_sigval.sival_ptr =
    212 					    sigevp->sigev_value.sival_ptr;
    213 				} else if (sigevp->sigev_notify == SIGEV_PORT) {
    214 					port_notify_t *pn =
    215 					    sigevp->sigev_value.sival_ptr;
    216 					head->lio_port = pn->portnfy_port;
    217 					head->lio_event = AIOLIO;
    218 					head->lio_sigevent = sigevp;
    219 					head->lio_sigval.sival_ptr =
    220 					    pn->portnfy_user;
    221 				} else {	/* SIGEV_SIGNAL */
    222 					head->lio_signo = sigevp->sigev_signo;
    223 					head->lio_sigval.sival_ptr =
    224 					    sigevp->sigev_value.sival_ptr;
    225 				}
    226 			}
    227 			head->lio_nent = head->lio_refcnt = nent;
    228 			sig_mutex_unlock(&head->lio_mutex);
    229 		}
    230 		/*
    231 		 * find UFS requests, errno == ENOTSUP/EBADFD,
    232 		 */
    233 		for (i = 0; i < nent; i++) {
    234 			if ((aiocbp = list[i]) == NULL ||
    235 			    aiocbp->aio_lio_opcode == LIO_NOP ||
    236 			    (aiocbp->aio_resultp.aio_errno != ENOTSUP &&
    237 			    aiocbp->aio_resultp.aio_errno != EBADFD)) {
    238 				if (head)
    239 					_lio_list_decr(head);
    240 				continue;
    241 			}
    242 			if (aiocbp->aio_resultp.aio_errno == EBADFD)
    243 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
    244 			if (aiocbp->aio_reqprio != 0) {
    245 				aiocbp->aio_resultp.aio_errno = EINVAL;
    246 				aiocbp->aio_resultp.aio_return = -1;
    247 				EIOflg = 1;
    248 				if (head)
    249 					_lio_list_decr(head);
    250 				continue;
    251 			}
    252 			/*
    253 			 * submit an AIO request with flags AIO_NO_KAIO
    254 			 * to avoid the kaio() syscall in _aio_rw()
    255 			 */
    256 			switch (aiocbp->aio_lio_opcode) {
    257 			case LIO_READ:
    258 				rw = AIOAREAD;
    259 				break;
    260 			case LIO_WRITE:
    261 				rw = AIOAWRITE;
    262 				break;
    263 			}
    264 			error = _aio_rw(aiocbp, head, &__nextworker_rw, rw,
    265 			    (AIO_NO_KAIO | AIO_NO_DUPS));
    266 			if (error == 0)
    267 				aio_ufs++;
    268 			else {
    269 				if (head)
    270 					_lio_list_decr(head);
    271 				aiocbp->aio_resultp.aio_errno = error;
    272 				EIOflg = 1;
    273 			}
    274 		}
    275 	}
    276 	if (EIOflg) {
    277 		errno = EIO;
    278 		return (-1);
    279 	}
    280 	if (mode == LIO_WAIT && oerrno == ENOTSUP) {
    281 		/*
    282 		 * call kaio(AIOLIOWAIT) to get all outstanding
    283 		 * kernel AIO requests
    284 		 */
    285 		if ((nent - aio_ufs) > 0)
    286 			(void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp);
    287 		if (head != NULL && head->lio_nent > 0) {
    288 			sig_mutex_lock(&head->lio_mutex);
    289 			while (head->lio_refcnt > 0) {
    290 				int err;
    291 				head->lio_waiting = 1;
    292 				pthread_cleanup_push(_lio_listio_cleanup, head);
    293 				err = sig_cond_wait(&head->lio_cond_cv,
    294 				    &head->lio_mutex);
    295 				pthread_cleanup_pop(0);
    296 				head->lio_waiting = 0;
    297 				if (err && head->lio_nent > 0) {
    298 					sig_mutex_unlock(&head->lio_mutex);
    299 					errno = err;
    300 					return (-1);
    301 				}
    302 			}
    303 			sig_mutex_unlock(&head->lio_mutex);
    304 			ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
    305 			_aio_lio_free(head);
    306 			for (i = 0; i < nent; i++) {
    307 				if ((aiocbp = list[i]) != NULL &&
    308 				    aiocbp->aio_resultp.aio_errno) {
    309 					errno = EIO;
    310 					return (-1);
    311 				}
    312 			}
    313 		}
    314 		return (0);
    315 	}
    316 	return (error);
    317 }
    318 
    319 static void
    320 _lio_list_decr(aio_lio_t *head)
    321 {
    322 	sig_mutex_lock(&head->lio_mutex);
    323 	head->lio_nent--;
    324 	head->lio_refcnt--;
    325 	sig_mutex_unlock(&head->lio_mutex);
    326 }
    327 
    328 /*
    329  * __aio_suspend() cancellation handler.
    330  */
    331 /* ARGSUSED */
    332 static void
    333 _aio_suspend_cleanup(int *counter)
    334 {
    335 	ASSERT(MUTEX_HELD(&__aio_mutex));
    336 	(*counter)--;		/* _aio_kernel_suspend or _aio_suscv_cnt */
    337 	sig_mutex_unlock(&__aio_mutex);
    338 }
    339 
/*
 * Common implementation behind aio_suspend(): wait until at least one of
 * the requests in "list" has completed, or the timeout expires.
 *
 *	list		array of nent control block pointers (aiocb_t, or
 *			aiocb64_t when largefile is set on !_LP64); NULL
 *			entries are ignored
 *	nent		number of entries; <= 0 fails with EINVAL
 *	timo		NULL: wait indefinitely; zero: poll; otherwise a
 *			relative timeout.  Negative fields or
 *			tv_nsec >= NANOSEC fail with EINVAL
 *	largefile	non-zero selects the aiocb64_t/AIOSUSPEND64 variant;
 *			it must be 0 when _LP64 is defined
 *
 * Returns 0 when the kernel call reports a completed request or when a
 * listed request's aio_errno is neither EINPROGRESS nor ECANCELED.
 * Returns -1 otherwise, with errno set to EAGAIN for "nothing completed"
 * (poll, timer expiry, nothing outstanding) or to the error returned by
 * the condition wait.
 */
static int
__aio_suspend(void **list, int nent, const timespec_t *timo, int largefile)
{
	int		cv_err;	/* error code from cond_xxx() */
	int		kerr;	/* error code from _kaio(AIOSUSPEND) */
	int		i;
	timespec_t	twait;	/* copy of timo for internal calculations */
	timespec_t	*wait = NULL;
	int		timedwait;
	int		req_outstanding;
	aiocb_t		**listp;
	aiocb_t		*aiocbp;
#if !defined(_LP64)
	aiocb64_t	**listp64;
	aiocb64_t	*aiocbp64;
#endif
	hrtime_t	hrtstart;
	hrtime_t	hrtend;
	hrtime_t	hrtres;

#if defined(_LP64)
	if (largefile)
		aio_panic("__aio_suspend: largefile set when _LP64 defined");
#endif

	if (nent <= 0) {
		errno = EINVAL;
		return (-1);
	}

	/* classify the timeout into one of the AIO_TIMEOUT_* cases */
	if (timo) {
		if (timo->tv_sec < 0 || timo->tv_nsec < 0 ||
		    timo->tv_nsec >= NANOSEC) {
			errno = EINVAL;
			return (-1);
		}
		/* Initialize start time if time monitoring desired */
		if (timo->tv_sec > 0 || timo->tv_nsec > 0) {
			timedwait = AIO_TIMEOUT_WAIT;
			hrtstart = gethrtime();
		} else {
			/* content of timeout = 0 : polling */
			timedwait = AIO_TIMEOUT_POLL;
		}
	} else {
		/* timeout pointer = NULL : wait indefinitely */
		timedwait = AIO_TIMEOUT_INDEF;
	}

	/* move every listed request that is in state CHECK to CHECKED */
#if !defined(_LP64)
	if (largefile) {
		listp64 = (aiocb64_t **)list;
		for (i = 0; i < nent; i++) {
			if ((aiocbp64 = listp64[i]) != NULL &&
			    aiocbp64->aio_state == CHECK)
				aiocbp64->aio_state = CHECKED;
		}
	} else
#endif	/* !_LP64 */
	{
		listp = (aiocb_t **)list;
		for (i = 0; i < nent; i++) {
			if ((aiocbp = listp[i]) != NULL &&
			    aiocbp->aio_state == CHECK)
				aiocbp->aio_state = CHECKED;
		}
	}

	sig_mutex_lock(&__aio_mutex);

	/*
	 * The next "if -case" is required to accelerate the
	 * access to completed RAW-IO requests.
	 */
	if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
		/* Only kernel requests pending */

		/*
		 * _aio_kernel_suspend is used to detect completed non RAW-IO
		 * requests.
		 * As long as this thread resides in the kernel (_kaio) further
		 * asynchronous non RAW-IO requests could be submitted.
		 */
		_aio_kernel_suspend++;

		/*
		 * Always do the kaio() call without using the KAIO_SUPPORTED()
		 * checks because it is not mandatory to have a valid fd
		 * set in the list entries, only the resultp must be set.
		 *
		 * _kaio(AIOSUSPEND ...) return values :
		 *  0:  everything ok, completed request found
		 * -1:  error
		 *  1:  no error : _aiodone awaked the _kaio(AIOSUSPEND,,)
		 *	system call using  _kaio(AIONOTIFY). It means, that some
		 *	non RAW-IOs completed in between.
		 */

		/*
		 * Two handlers are pushed: the inner one re-takes __aio_mutex
		 * (and is always run by the pop(1) below); the outer one is
		 * left in place only for the cancellation case, where it
		 * undoes the _aio_kernel_suspend increment and drops the mutex.
		 */
		pthread_cleanup_push(_aio_suspend_cleanup,
		    &_aio_kernel_suspend);
		pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
		sig_mutex_unlock(&__aio_mutex);
		_cancel_prologue();
		kerr = (int)_kaio(largefile? AIOSUSPEND64 : AIOSUSPEND,
		    list, nent, timo, -1);
		_cancel_epilogue();
		pthread_cleanup_pop(1);	/* sig_mutex_lock(&__aio_mutex) */
		pthread_cleanup_pop(0);

		_aio_kernel_suspend--;

		if (!kerr) {
			sig_mutex_unlock(&__aio_mutex);
			return (0);
		}
	} else {
		kerr = 1;	/* simulation: _kaio detected AIONOTIFY */
	}

	/*
	 * Return kernel error code if no other IOs are outstanding.
	 */
	req_outstanding = _aio_doneq_cnt + _aio_outstand_cnt;

	sig_mutex_unlock(&__aio_mutex);

	if (req_outstanding == 0) {
		/* no IOs outstanding in the thread pool */
		if (kerr == 1)
			/* return "no IOs completed" */
			errno = EAGAIN;
		/* otherwise errno is left as the _kaio() call set it */
		return (-1);
	}

	/*
	 * IOs using the thread pool are outstanding.
	 */
	if (timedwait == AIO_TIMEOUT_WAIT) {
		/* time monitoring */
		hrtend = hrtstart + (hrtime_t)timo->tv_sec * (hrtime_t)NANOSEC +
		    (hrtime_t)timo->tv_nsec;
		hrtres = hrtend - gethrtime();
		/* already past the deadline: use a minimal 1ns wait */
		if (hrtres <= 0)
			hrtres = 1;
		twait.tv_sec = hrtres / (hrtime_t)NANOSEC;
		twait.tv_nsec = hrtres % (hrtime_t)NANOSEC;
		wait = &twait;
	} else if (timedwait == AIO_TIMEOUT_POLL) {
		twait = *timo;	/* content of timo = 0 : polling */
		wait = &twait;
	}

	for (;;) {
		int	error;
		int	inprogress;

		/* first scan file system requests */
		inprogress = 0;
		for (i = 0; i < nent; i++) {
#if !defined(_LP64)
			if (largefile) {
				if ((aiocbp64 = listp64[i]) == NULL)
					continue;
				error = aiocbp64->aio_resultp.aio_errno;
			} else
#endif
			{
				if ((aiocbp = listp[i]) == NULL)
					continue;
				error = aiocbp->aio_resultp.aio_errno;
			}
			if (error == EINPROGRESS)
				inprogress = 1;
			else if (error != ECANCELED) {
				/* a listed request has finished: success */
				errno = 0;
				return (0);
			}
		}

		sig_mutex_lock(&__aio_mutex);

		/*
		 * If there aren't outstanding I/Os in the thread pool then
		 * we have to return here, provided that all kernel RAW-IOs
		 * also completed.
		 * If the kernel was notified to return, then we have to check
		 * possible pending RAW-IOs.
		 */
		if (_aio_outstand_cnt == 0 && inprogress == 0 && kerr != 1) {
			sig_mutex_unlock(&__aio_mutex);
			errno = EAGAIN;
			break;
		}

		/*
		 * There are outstanding IOs in the thread pool or the kernel
		 * was notified to return.
		 * Check pending RAW-IOs first.
		 */
		if (kerr == 1) {
			/*
			 * _aiodone just notified the kernel about
			 * completed non RAW-IOs (AIONOTIFY was detected).
			 */
			if (timedwait == AIO_TIMEOUT_WAIT) {
				/* Update remaining timeout for the kernel */
				hrtres = hrtend - gethrtime();
				if (hrtres <= 0) {
					/* timer expired */
					sig_mutex_unlock(&__aio_mutex);
					errno = EAGAIN;
					break;
				}
				wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
				wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
			}
			_aio_kernel_suspend++;

			/* same handler arrangement as the first kernel call */
			pthread_cleanup_push(_aio_suspend_cleanup,
			    &_aio_kernel_suspend);
			pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
			sig_mutex_unlock(&__aio_mutex);
			_cancel_prologue();
			kerr = (int)_kaio(largefile? AIOSUSPEND64 : AIOSUSPEND,
			    list, nent, wait, -1);
			_cancel_epilogue();
			pthread_cleanup_pop(1);
			pthread_cleanup_pop(0);

			_aio_kernel_suspend--;

			if (!kerr) {
				sig_mutex_unlock(&__aio_mutex);
				return (0);
			}
		}

		/* polling: one pass is all the caller asked for */
		if (timedwait == AIO_TIMEOUT_POLL) {
			sig_mutex_unlock(&__aio_mutex);
			errno = EAGAIN;
			break;
		}

		if (timedwait == AIO_TIMEOUT_WAIT) {
			/* Update remaining timeout */
			hrtres = hrtend - gethrtime();
			if (hrtres <= 0) {
				/* timer expired */
				sig_mutex_unlock(&__aio_mutex);
				errno = EAGAIN;
				break;
			}
			wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
			wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
		}

		/* nothing left in the thread pool to sleep on: rescan */
		if (_aio_outstand_cnt == 0) {
			sig_mutex_unlock(&__aio_mutex);
			continue;
		}

		_aio_suscv_cnt++;	/* ID for _aiodone (wake up) */

		pthread_cleanup_push(_aio_suspend_cleanup, &_aio_suscv_cnt);
		if (timedwait == AIO_TIMEOUT_WAIT) {
			cv_err = sig_cond_reltimedwait(&_aio_iowait_cv,
			    &__aio_mutex, wait);
			/* report a timed-out wait as EAGAIN */
			if (cv_err == ETIME)
				cv_err = EAGAIN;
		} else {
			/* wait indefinitely */
			cv_err = sig_cond_wait(&_aio_iowait_cv, &__aio_mutex);
		}
		/* this decrements _aio_suscv_cnt and drops __aio_mutex */
		pthread_cleanup_pop(1);

		if (cv_err) {
			errno = cv_err;
			break;
		}
	}
	return (-1);
}
    623 
    624 int
    625 aio_suspend(const aiocb_t * const list[], int nent,
    626     const timespec_t *timeout)
    627 {
    628 	return (__aio_suspend((void **)list, nent, timeout, 0));
    629 }
    630 
    631 int
    632 aio_error(const aiocb_t *aiocbp)
    633 {
    634 	const aio_result_t *resultp = &aiocbp->aio_resultp;
    635 	aio_req_t *reqp;
    636 	int error;
    637 
    638 	if ((error = resultp->aio_errno) == EINPROGRESS) {
    639 		if (aiocbp->aio_state == CHECK) {
    640 			/*
    641 			 * Always do the kaio() call without using the
    642 			 * KAIO_SUPPORTED() checks because it is not
    643 			 * mandatory to have a valid fd set in the
    644 			 * aiocb, only the resultp must be set.
    645 			 */
    646 			if ((int)_kaio(AIOERROR, aiocbp) == EINVAL) {
    647 				errno = EINVAL;
    648 				return (-1);
    649 			}
    650 			error = resultp->aio_errno;
    651 		} else if (aiocbp->aio_state == CHECKED) {
    652 			((aiocb_t *)aiocbp)->aio_state = CHECK;
    653 		}
    654 	} else if (aiocbp->aio_state == USERAIO) {
    655 		sig_mutex_lock(&__aio_mutex);
    656 		if ((reqp = _aio_hash_del((aio_result_t *)resultp)) == NULL) {
    657 			sig_mutex_unlock(&__aio_mutex);
    658 			((aiocb_t *)aiocbp)->aio_state = CHECKED;
    659 		} else {
    660 			((aiocb_t *)aiocbp)->aio_state = NOCHECK;
    661 			ASSERT(reqp->req_head == NULL);
    662 			(void) _aio_req_remove(reqp);
    663 			sig_mutex_unlock(&__aio_mutex);
    664 			_aio_req_free(reqp);
    665 		}
    666 	}
    667 	return (error);
    668 }
    669 
    670 ssize_t
    671 aio_return(aiocb_t *aiocbp)
    672 {
    673 	aio_result_t *resultp = &aiocbp->aio_resultp;
    674 	aio_req_t *reqp;
    675 	int error;
    676 	ssize_t retval;
    677 
    678 	/*
    679 	 * The _aiodone() function stores resultp->aio_return before
    680 	 * storing resultp->aio_errno (with an membar_producer() in
    681 	 * between).  We use membar_consumer() below to ensure proper
    682 	 * memory ordering between _aiodone() and ourself.
    683 	 */
    684 	error = resultp->aio_errno;
    685 	membar_consumer();
    686 	retval = resultp->aio_return;
    687 
    688 	/*
    689 	 * we use this condition to indicate either that
    690 	 * aio_return() has been called before or should
    691 	 * not have been called yet.
    692 	 */
    693 	if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
    694 		errno = error;
    695 		return (-1);
    696 	}
    697 
    698 	/*
    699 	 * Before we return, mark the result as being returned so that later
    700 	 * calls to aio_return() will return the fact that the result has
    701 	 * already been returned.
    702 	 */
    703 	sig_mutex_lock(&__aio_mutex);
    704 	/* retest, in case more than one thread actually got in here */
    705 	if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) {
    706 		sig_mutex_unlock(&__aio_mutex);
    707 		errno = EINVAL;
    708 		return (-1);
    709 	}
    710 	resultp->aio_return = -1;
    711 	resultp->aio_errno = EINVAL;
    712 	if ((reqp = _aio_hash_del(resultp)) == NULL)
    713 		sig_mutex_unlock(&__aio_mutex);
    714 	else {
    715 		aiocbp->aio_state = NOCHECK;
    716 		ASSERT(reqp->req_head == NULL);
    717 		(void) _aio_req_remove(reqp);
    718 		sig_mutex_unlock(&__aio_mutex);
    719 		_aio_req_free(reqp);
    720 	}
    721 
    722 	if (retval == -1)
    723 		errno = error;
    724 	return (retval);
    725 }
    726 
    727 void
    728 _lio_remove(aio_req_t *reqp)
    729 {
    730 	aio_lio_t *head;
    731 	int refcnt;
    732 
    733 	if ((head = reqp->req_head) != NULL) {
    734 		sig_mutex_lock(&head->lio_mutex);
    735 		ASSERT(head->lio_refcnt == head->lio_nent);
    736 		refcnt = --head->lio_nent;
    737 		head->lio_refcnt--;
    738 		sig_mutex_unlock(&head->lio_mutex);
    739 		if (refcnt == 0)
    740 			_aio_lio_free(head);
    741 		reqp->req_head = NULL;
    742 	}
    743 }
    744 
    745 /*
    746  * This function returns the number of asynchronous I/O requests submitted.
    747  */
    748 static int
    749 __aio_fsync_bar(aiocb_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp,
    750     int workerscnt)
    751 {
    752 	int i;
    753 	int error;
    754 	aio_worker_t *next = aiowp;
    755 
    756 	for (i = 0; i < workerscnt; i++) {
    757 		error = _aio_rw(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO);
    758 		if (error != 0) {
    759 			sig_mutex_lock(&head->lio_mutex);
    760 			head->lio_mode = LIO_DESTROY;	/* ignore fsync */
    761 			head->lio_nent -= workerscnt - i;
    762 			head->lio_refcnt -= workerscnt - i;
    763 			sig_mutex_unlock(&head->lio_mutex);
    764 			errno = EAGAIN;
    765 			return (i);
    766 		}
    767 		next = next->work_forw;
    768 	}
    769 	return (i);
    770 }
    771 
    772 int
    773 aio_fsync(int op, aiocb_t *aiocbp)
    774 {
    775 	aio_lio_t *head;
    776 	struct stat statb;
    777 	int fret;
    778 
    779 	if (aiocbp == NULL)
    780 		return (0);
    781 	if (op != O_DSYNC && op != O_SYNC) {
    782 		errno = EINVAL;
    783 		return (-1);
    784 	}
    785 	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
    786 		errno = EBUSY;
    787 		return (-1);
    788 	}
    789 	if (fstat(aiocbp->aio_fildes, &statb) < 0)
    790 		return (-1);
    791 	if (_aio_sigev_thread(aiocbp) != 0)
    792 		return (-1);
    793 
    794 	/*
    795 	 * Kernel aio_fsync() is not supported.
    796 	 * We force user-level aio_fsync() just
    797 	 * for the notification side-effect.
    798 	 */
    799 	if (!__uaio_ok && __uaio_init() == -1)
    800 		return (-1);
    801 
    802 	/*
    803 	 * The first asynchronous I/O request in the current process will
    804 	 * create a bunch of workers (via __uaio_init()).  If the number
    805 	 * of workers is zero then the number of pending asynchronous I/O
    806 	 * requests is zero.  In such a case only execute the standard
    807 	 * fsync(3C) or fdatasync(3RT) as appropriate.
    808 	 */
    809 	if (__rw_workerscnt == 0) {
    810 		if (op == O_DSYNC)
    811 			return (__fdsync(aiocbp->aio_fildes, FDSYNC));
    812 		else
    813 			return (__fdsync(aiocbp->aio_fildes, FSYNC));
    814 	}
    815 
    816 	/*
    817 	 * re-use aio_offset as the op field.
    818 	 * 	O_DSYNC - fdatasync()
    819 	 * 	O_SYNC - fsync()
    820 	 */
    821 	aiocbp->aio_offset = op;
    822 	aiocbp->aio_lio_opcode = AIOFSYNC;
    823 
    824 	/*
    825 	 * Create a list of fsync requests.  The worker that
    826 	 * gets the last request will do the fsync request.
    827 	 */
    828 	head = _aio_lio_alloc();
    829 	if (head == NULL) {
    830 		errno = EAGAIN;
    831 		return (-1);
    832 	}
    833 	head->lio_mode = LIO_FSYNC;
    834 	head->lio_nent = head->lio_refcnt = __rw_workerscnt;
    835 	head->lio_largefile = 0;
    836 
    837 	/*
    838 	 * Insert an fsync request on every worker's queue.
    839 	 */
    840 	fret = __aio_fsync_bar(aiocbp, head, __workers_rw, __rw_workerscnt);
    841 	if (fret != __rw_workerscnt) {
    842 		/*
    843 		 * Fewer fsync requests than workers means that it was
    844 		 * not possible to submit fsync requests to all workers.
    845 		 * Actions:
    846 		 * a) number of fsync requests submitted is 0:
    847 		 *    => free allocated memory (aio_lio_t).
    848 		 * b) number of fsync requests submitted is > 0:
    849 		 *    => the last worker executing the fsync request
    850 		 *	 will free the aio_lio_t struct.
    851 		 */
    852 		if (fret == 0)
    853 			_aio_lio_free(head);
    854 		return (-1);
    855 	}
    856 	return (0);
    857 }
    858 
    859 int
    860 aio_cancel(int fd, aiocb_t *aiocbp)
    861 {
    862 	aio_req_t *reqp;
    863 	aio_worker_t *aiowp;
    864 	int done = 0;
    865 	int canceled = 0;
    866 	struct stat buf;
    867 
    868 	if (fstat(fd, &buf) < 0)
    869 		return (-1);
    870 
    871 	if (aiocbp != NULL) {
    872 		if (fd != aiocbp->aio_fildes) {
    873 			errno = EINVAL;
    874 			return (-1);
    875 		}
    876 		if (aiocbp->aio_state == USERAIO) {
    877 			sig_mutex_lock(&__aio_mutex);
    878 			reqp = _aio_hash_find(&aiocbp->aio_resultp);
    879 			if (reqp == NULL) {
    880 				sig_mutex_unlock(&__aio_mutex);
    881 				return (AIO_ALLDONE);
    882 			}
    883 			aiowp = reqp->req_worker;
    884 			sig_mutex_lock(&aiowp->work_qlock1);
    885 			(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
    886 			sig_mutex_unlock(&aiowp->work_qlock1);
    887 			sig_mutex_unlock(&__aio_mutex);
    888 			if (done)
    889 				return (AIO_ALLDONE);
    890 			if (canceled)
    891 				return (AIO_CANCELED);
    892 			return (AIO_NOTCANCELED);
    893 		}
    894 		if (aiocbp->aio_state == USERAIO_DONE)
    895 			return (AIO_ALLDONE);
    896 		return ((int)_kaio(AIOCANCEL, fd, aiocbp));
    897 	}
    898 
    899 	return (aiocancel_all(fd));
    900 }
    901 
    902 /*
    903  * __aio_waitn() cancellation handler.
    904  */
    905 /* ARGSUSED */
    906 static void
    907 _aio_waitn_cleanup(void *arg)
    908 {
    909 	ASSERT(MUTEX_HELD(&__aio_mutex));
    910 
    911 	/* check for pending aio_waitn() calls */
    912 	_aio_flags &= ~(AIO_LIB_WAITN | AIO_WAIT_INPROGRESS | AIO_IO_WAITING);
    913 	if (_aio_flags & AIO_LIB_WAITN_PENDING) {
    914 		_aio_flags &= ~AIO_LIB_WAITN_PENDING;
    915 		(void) cond_signal(&_aio_waitn_cv);
    916 	}
    917 
    918 	sig_mutex_unlock(&__aio_mutex);
    919 }
    920 
    921 /*
    922  * aio_waitn can be used to reap the results of several I/O operations that
    923  * were submitted asynchronously. The submission of I/Os can be done using
    924  * existing POSIX interfaces: lio_listio, aio_write or aio_read.
    925  * aio_waitn waits until "nwait" I/Os (supplied as a parameter) have
    926  * completed and it returns the descriptors for these I/Os in "list". The
    927  * maximum size of this list is given by "nent" and the actual number of I/Os
    928  * completed is returned in "nwait". Otherwise aio_waitn might also
    929  * return if the timeout expires. Additionally, aio_waitn returns 0 if
    930  * successful or -1 if an error occurred.
    931  */
    932 static int
    933 __aio_waitn(void **list, uint_t nent, uint_t *nwait, const timespec_t *utimo)
    934 {
    935 	int error = 0;
    936 	uint_t dnwait = 0;	/* amount of requests in the waitn-done list */
    937 	uint_t kwaitcnt;	/* expected "done" requests from kernel */
    938 	uint_t knentcnt;	/* max. expected "done" requests from kernel */
    939 	int uerrno = 0;
    940 	int kerrno = 0;		/* save errno from _kaio() call */
    941 	int timedwait = AIO_TIMEOUT_UNDEF;
    942 	aio_req_t *reqp;
    943 	timespec_t end;
    944 	timespec_t twait;	/* copy of utimo for internal calculations */
    945 	timespec_t *wait = NULL;
    946 
    947 	if (nent == 0 || *nwait == 0 || *nwait > nent) {
    948 		errno = EINVAL;
    949 		return (-1);
    950 	}
    951 
    952 	/*
    953 	 * Only one running aio_waitn call per process allowed.
    954 	 * Further calls will be blocked here until the running
    955 	 * call finishes.
    956 	 */
    957 
    958 	sig_mutex_lock(&__aio_mutex);
    959 
    960 	while (_aio_flags & AIO_LIB_WAITN) {
    961 		if (utimo && utimo->tv_sec == 0 && utimo->tv_nsec == 0) {
    962 			sig_mutex_unlock(&__aio_mutex);
    963 			*nwait = 0;
    964 			return (0);
    965 		}
    966 		_aio_flags |= AIO_LIB_WAITN_PENDING;
    967 		pthread_cleanup_push(sig_mutex_unlock, &__aio_mutex);
    968 		error = sig_cond_wait(&_aio_waitn_cv, &__aio_mutex);
    969 		pthread_cleanup_pop(0);
    970 		if (error != 0) {
    971 			sig_mutex_unlock(&__aio_mutex);
    972 			*nwait = 0;
    973 			errno = error;
    974 			return (-1);
    975 		}
    976 	}
    977 
    978 	pthread_cleanup_push(_aio_waitn_cleanup, NULL);
    979 
    980 	_aio_flags |= AIO_LIB_WAITN;
    981 
    982 	if (_aio_check_timeout(utimo, &end, &timedwait) != 0) {
    983 		error = -1;
    984 		dnwait = 0;
    985 		goto out;
    986 	}
    987 	if (timedwait != AIO_TIMEOUT_INDEF) {
    988 		twait = *utimo;
    989 		wait = &twait;
    990 	}
    991 
    992 	/*
    993 	 * If both counters are still set to zero, then only
    994 	 * kernel requests are currently outstanding (raw-I/Os).
    995 	 */
    996 	if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
    997 		for (;;) {
    998 			kwaitcnt = *nwait - dnwait;
    999 			knentcnt = nent - dnwait;
   1000 			if (knentcnt > AIO_WAITN_MAXIOCBS)
   1001 				knentcnt = AIO_WAITN_MAXIOCBS;
   1002 			kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;
   1003 
   1004 			pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
   1005 			sig_mutex_unlock(&__aio_mutex);
   1006 			_cancel_prologue();
   1007 			error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
   1008 			    &kwaitcnt, wait);
   1009 			_cancel_epilogue();
   1010 			pthread_cleanup_pop(1);
   1011 
   1012 			if (error == 0) {
   1013 				dnwait += kwaitcnt;
   1014 				if (dnwait >= *nwait ||
   1015 				    *nwait < AIO_WAITN_MAXIOCBS)
   1016 					break;
   1017 				if (timedwait == AIO_TIMEOUT_WAIT) {
   1018 					error = _aio_get_timedelta(&end, wait);
   1019 					if (error ==  -1) {
   1020 						/* timer expired */
   1021 						errno = ETIME;
   1022 						break;
   1023 					}
   1024 				}
   1025 				continue;
   1026 			}
   1027 			if (errno == EAGAIN) {
   1028 				if (dnwait > 0)
   1029 					error = 0;
   1030 				break;
   1031 			}
   1032 			if (errno == ETIME || errno == EINTR) {
   1033 				dnwait += kwaitcnt;
   1034 				break;
   1035 			}
   1036 			/* fatal error */
   1037 			break;
   1038 		}
   1039 
   1040 		goto out;
   1041 	}
   1042 
   1043 	/* File system I/Os outstanding ... */
   1044 
   1045 	if (timedwait == AIO_TIMEOUT_UNDEF) {
   1046 		if (_aio_check_timeout(utimo, &end, &timedwait) != 0) {
   1047 			error = -1;
   1048 			dnwait = 0;
   1049 			goto out;
   1050 		}
   1051 		if (timedwait != AIO_TIMEOUT_INDEF) {
   1052 			twait = *utimo;
   1053 			wait = &twait;
   1054 		}
   1055 	}
   1056 
   1057 	for (;;) {
   1058 		uint_t	sum_reqs;
   1059 
   1060 		/*
   1061 		 * Calculate sum of active non RAW-IO requests (sum_reqs).
   1062 		 * If the expected amount of completed requests (*nwait) is
   1063 		 * greater than the calculated sum (sum_reqs) then
   1064 		 * use _kaio to check pending RAW-IO requests.
   1065 		 */
   1066 		sum_reqs = _aio_doneq_cnt + dnwait + _aio_outstand_cnt;
   1067 		kwaitcnt = (*nwait > sum_reqs) ? *nwait - sum_reqs : 0;
   1068 
   1069 		if (kwaitcnt != 0) {
   1070 			/* possibly some kernel I/Os outstanding */
   1071 			knentcnt = nent - dnwait;
   1072 			if (knentcnt > AIO_WAITN_MAXIOCBS)
   1073 				knentcnt = AIO_WAITN_MAXIOCBS;
   1074 			kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;
   1075 
   1076 			_aio_flags |= AIO_WAIT_INPROGRESS;
   1077 
   1078 			pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
   1079 			sig_mutex_unlock(&__aio_mutex);
   1080 			_cancel_prologue();
   1081 			error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
   1082 			    &kwaitcnt, wait);
   1083 			_cancel_epilogue();
   1084 			pthread_cleanup_pop(1);
   1085 
   1086 			_aio_flags &= ~AIO_WAIT_INPROGRESS;
   1087 
   1088 			if (error == 0) {
   1089 				dnwait += kwaitcnt;
   1090 			} else {
   1091 				switch (errno) {
   1092 				case EINVAL:
   1093 				case EAGAIN:
   1094 					/* don't wait for kernel I/Os */
   1095 					kerrno = 0; /* ignore _kaio() errno */
   1096 					*nwait = _aio_doneq_cnt +
   1097 					    _aio_outstand_cnt + dnwait;
   1098 					error = 0;
   1099 					break;
   1100 				case EINTR:
   1101 				case ETIME:
   1102 					/* just scan for completed LIB I/Os */
   1103 					dnwait += kwaitcnt;
   1104 					timedwait = AIO_TIMEOUT_POLL;
   1105 					kerrno = errno;	/* save _kaio() errno */
   1106 					error = 0;
   1107 					break;
   1108 				default:
   1109 					kerrno = errno;	/* save _kaio() errno */
   1110 					break;
   1111 				}
   1112 			}
   1113 			if (error)
   1114 				break;		/* fatal kernel error */
   1115 		}
   1116 
   1117 		/* check completed FS requests in the "done" queue */
   1118 
   1119 		while (_aio_doneq_cnt && dnwait < nent) {
   1120 			/* get done requests */
   1121 			if ((reqp = _aio_req_remove(NULL)) != NULL) {
   1122 				(void) _aio_hash_del(reqp->req_resultp);
   1123 				list[dnwait++] = reqp->req_aiocbp;
   1124 				_aio_req_mark_done(reqp);
   1125 				_lio_remove(reqp);
   1126 				_aio_req_free(reqp);
   1127 			}
   1128 		}
   1129 
   1130 		if (dnwait >= *nwait) {
   1131 			/* min. requested amount of completed I/Os satisfied */
   1132 			break;
   1133 		}
   1134 		if (timedwait == AIO_TIMEOUT_WAIT &&
   1135 		    (error = _aio_get_timedelta(&end, wait)) == -1) {
   1136 			/* timer expired */
   1137 			uerrno = ETIME;
   1138 			break;
   1139 		}
   1140 
   1141 		/*
   1142 		 * If some I/Os are outstanding and we have to wait for them,
   1143 		 * then sleep here.  _aiodone() will call _aio_waitn_wakeup()
   1144 		 * to wakeup this thread as soon as the required amount of
   1145 		 * completed I/Os is done.
   1146 		 */
   1147 		if (_aio_outstand_cnt > 0 && timedwait != AIO_TIMEOUT_POLL) {
   1148 			/*
   1149 			 * _aio_waitn_wakeup() will wake up this thread when:
   1150 			 * - _aio_waitncnt requests are completed or
   1151 			 * - _aio_outstand_cnt becomes zero.
   1152 			 * sig_cond_reltimedwait() could also return with
   1153 			 * a timeout error (ETIME).
   1154 			 */
   1155 			if (*nwait < _aio_outstand_cnt)
   1156 				_aio_waitncnt = *nwait;
   1157 			else
   1158 				_aio_waitncnt = _aio_outstand_cnt;
   1159 
   1160 			_aio_flags |= AIO_IO_WAITING;
   1161 
   1162 			if (wait)
   1163 				uerrno = sig_cond_reltimedwait(&_aio_iowait_cv,
   1164 				    &__aio_mutex, wait);
   1165 			else
   1166 				uerrno = sig_cond_wait(&_aio_iowait_cv,
   1167 				    &__aio_mutex);
   1168 
   1169 			_aio_flags &= ~AIO_IO_WAITING;
   1170 
   1171 			if (uerrno == ETIME) {
   1172 				timedwait = AIO_TIMEOUT_POLL;
   1173 				continue;
   1174 			}
   1175 			if (uerrno != 0)
   1176 				timedwait = AIO_TIMEOUT_POLL;
   1177 		}
   1178 
   1179 		if (timedwait == AIO_TIMEOUT_POLL) {
   1180 			/* polling or timer expired */
   1181 			break;
   1182 		}
   1183 	}
   1184 
   1185 	errno = uerrno == 0 ? kerrno : uerrno;
   1186 	if (errno)
   1187 		error = -1;
   1188 	else
   1189 		error = 0;
   1190 
   1191 out:
   1192 	*nwait = dnwait;
   1193 
   1194 	pthread_cleanup_pop(1);		/* drops __aio_mutex */
   1195 
   1196 	return (error);
   1197 }
   1198 
   1199 int
   1200 aio_waitn(aiocb_t *list[], uint_t nent, uint_t *nwait,
   1201 	const timespec_t *timeout)
   1202 {
   1203 	return (__aio_waitn((void **)list, nent, nwait, timeout));
   1204 }
   1205 
   1206 void
   1207 _aio_waitn_wakeup(void)
   1208 {
   1209 	/*
   1210 	 * __aio_waitn() sets AIO_IO_WAITING to notify _aiodone() that
   1211 	 * it is waiting for completed I/Os. The number of required
   1212 	 * completed I/Os is stored into "_aio_waitncnt".
   1213 	 * aio_waitn() is woken up when
   1214 	 * - there are no further outstanding I/Os
   1215 	 *   (_aio_outstand_cnt == 0) or
   1216 	 * - the expected number of I/Os has completed.
   1217 	 * Only one __aio_waitn() function waits for completed I/Os at
   1218 	 * a time.
   1219 	 *
   1220 	 * __aio_suspend() increments "_aio_suscv_cnt" to notify
   1221 	 * _aiodone() that at least one __aio_suspend() call is
   1222 	 * waiting for completed I/Os.
   1223 	 * There could be more than one __aio_suspend() function
   1224 	 * waiting for completed I/Os. Because every function should
   1225 	 * be waiting for different I/Os, _aiodone() has to wake up all
   1226 	 * __aio_suspend() functions each time.
   1227 	 * Every __aio_suspend() function will compare the recently
   1228 	 * completed I/O with its own list.
   1229 	 */
   1230 	ASSERT(MUTEX_HELD(&__aio_mutex));
   1231 	if (_aio_flags & AIO_IO_WAITING) {
   1232 		if (_aio_waitncnt > 0)
   1233 			_aio_waitncnt--;
   1234 		if (_aio_outstand_cnt == 0 || _aio_waitncnt == 0 ||
   1235 		    _aio_suscv_cnt > 0)
   1236 			(void) cond_broadcast(&_aio_iowait_cv);
   1237 	} else {
   1238 		/* Wake up waiting aio_suspend calls */
   1239 		if (_aio_suscv_cnt > 0)
   1240 			(void) cond_broadcast(&_aio_iowait_cv);
   1241 	}
   1242 }
   1243 
   1244 /*
   1245  * timedwait values :
   1246  * AIO_TIMEOUT_POLL 	: polling
   1247  * AIO_TIMEOUT_WAIT 	: timeout
   1248  * AIO_TIMEOUT_INDEF	: wait indefinitely
   1249  */
   1250 static int
   1251 _aio_check_timeout(const timespec_t *utimo, timespec_t *end, int *timedwait)
   1252 {
   1253 	struct	timeval	curtime;
   1254 
   1255 	if (utimo) {
   1256 		if (utimo->tv_sec < 0 || utimo->tv_nsec < 0 ||
   1257 		    utimo->tv_nsec >= NANOSEC) {
   1258 			errno = EINVAL;
   1259 			return (-1);
   1260 		}
   1261 		if (utimo->tv_sec > 0 || utimo->tv_nsec > 0) {
   1262 			(void) gettimeofday(&curtime, NULL);
   1263 			end->tv_sec = utimo->tv_sec + curtime.tv_sec;
   1264 			end->tv_nsec = utimo->tv_nsec + 1000 * curtime.tv_usec;
   1265 			if (end->tv_nsec >= NANOSEC) {
   1266 				end->tv_nsec -= NANOSEC;
   1267 				end->tv_sec += 1;
   1268 			}
   1269 			*timedwait = AIO_TIMEOUT_WAIT;
   1270 		} else {
   1271 			/* polling */
   1272 			*timedwait = AIO_TIMEOUT_POLL;
   1273 		}
   1274 	} else {
   1275 		*timedwait = AIO_TIMEOUT_INDEF;		/* wait indefinitely */
   1276 	}
   1277 	return (0);
   1278 }
   1279 
   1280 #if !defined(_LP64)
   1281 
   1282 int
   1283 aio_read64(aiocb64_t *aiocbp)
   1284 {
   1285 	if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
   1286 		errno = EINVAL;
   1287 		return (-1);
   1288 	}
   1289 	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
   1290 		errno = EBUSY;
   1291 		return (-1);
   1292 	}
   1293 	if (_aio_sigev_thread64(aiocbp) != 0)
   1294 		return (-1);
   1295 	aiocbp->aio_lio_opcode = LIO_READ;
   1296 	return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAREAD64,
   1297 	    (AIO_KAIO | AIO_NO_DUPS)));
   1298 }
   1299 
   1300 int
   1301 aio_write64(aiocb64_t *aiocbp)
   1302 {
   1303 	if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
   1304 		errno = EINVAL;
   1305 		return (-1);
   1306 	}
   1307 	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
   1308 		errno = EBUSY;
   1309 		return (-1);
   1310 	}
   1311 	if (_aio_sigev_thread64(aiocbp) != 0)
   1312 		return (-1);
   1313 	aiocbp->aio_lio_opcode = LIO_WRITE;
   1314 	return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAWRITE64,
   1315 	    (AIO_KAIO | AIO_NO_DUPS)));
   1316 }
   1317 
   1318 int
   1319 lio_listio64(int mode, aiocb64_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list,
   1320 	int nent, struct sigevent *_RESTRICT_KYWD sigevp)
   1321 {
   1322 	int 		aio_ufs = 0;
   1323 	int 		oerrno = 0;
   1324 	aio_lio_t	*head = NULL;
   1325 	aiocb64_t	*aiocbp;
   1326 	int		state = 0;
   1327 	int 		EIOflg = 0;
   1328 	int 		rw;
   1329 	int		do_kaio = 0;
   1330 	int 		error;
   1331 	int 		i;
   1332 
   1333 	if (!_kaio_ok)
   1334 		_kaio_init();
   1335 
   1336 	if (aio_list_max == 0)
   1337 		aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
   1338 
   1339 	if (nent <= 0 || nent > aio_list_max) {
   1340 		errno = EINVAL;
   1341 		return (-1);
   1342 	}
   1343 
   1344 	switch (mode) {
   1345 	case LIO_WAIT:
   1346 		state = NOCHECK;
   1347 		break;
   1348 	case LIO_NOWAIT:
   1349 		state = CHECK;
   1350 		break;
   1351 	default:
   1352 		errno = EINVAL;
   1353 		return (-1);
   1354 	}
   1355 
   1356 	for (i = 0; i < nent; i++) {
   1357 		if ((aiocbp = list[i]) == NULL)
   1358 			continue;
   1359 		if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
   1360 			errno = EBUSY;
   1361 			return (-1);
   1362 		}
   1363 		if (_aio_sigev_thread64(aiocbp) != 0)
   1364 			return (-1);
   1365 		if (aiocbp->aio_lio_opcode == LIO_NOP)
   1366 			aiocbp->aio_state = NOCHECK;
   1367 		else {
   1368 			aiocbp->aio_state = state;
   1369 			if (KAIO_SUPPORTED(aiocbp->aio_fildes))
   1370 				do_kaio++;
   1371 			else
   1372 				aiocbp->aio_resultp.aio_errno = ENOTSUP;
   1373 		}
   1374 	}
   1375 	if (_aio_sigev_thread_init(sigevp) != 0)
   1376 		return (-1);
   1377 
   1378 	if (do_kaio) {
   1379 		error = (int)_kaio(AIOLIO64, mode, list, nent, sigevp);
   1380 		if (error == 0)
   1381 			return (0);
   1382 		oerrno = errno;
   1383 	} else {
   1384 		oerrno = errno = ENOTSUP;
   1385 		error = -1;
   1386 	}
   1387 
   1388 	if (error == -1 && errno == ENOTSUP) {
   1389 		error = errno = 0;
   1390 		/*
   1391 		 * If LIO_WAIT, or notification required, allocate a list head.
   1392 		 */
   1393 		if (mode == LIO_WAIT ||
   1394 		    (sigevp != NULL &&
   1395 		    (sigevp->sigev_notify == SIGEV_SIGNAL ||
   1396 		    sigevp->sigev_notify == SIGEV_THREAD ||
   1397 		    sigevp->sigev_notify == SIGEV_PORT)))
   1398 			head = _aio_lio_alloc();
   1399 		if (head) {
   1400 			sig_mutex_lock(&head->lio_mutex);
   1401 			head->lio_mode = mode;
   1402 			head->lio_largefile = 1;
   1403 			if (mode == LIO_NOWAIT && sigevp != NULL) {
   1404 				if (sigevp->sigev_notify == SIGEV_THREAD) {
   1405 					head->lio_port = sigevp->sigev_signo;
   1406 					head->lio_event = AIOLIO64;
   1407 					head->lio_sigevent = sigevp;
   1408 					head->lio_sigval.sival_ptr =
   1409 					    sigevp->sigev_value.sival_ptr;
   1410 				} else if (sigevp->sigev_notify == SIGEV_PORT) {
   1411 					port_notify_t *pn =
   1412 					    sigevp->sigev_value.sival_ptr;
   1413 					head->lio_port = pn->portnfy_port;
   1414 					head->lio_event = AIOLIO64;
   1415 					head->lio_sigevent = sigevp;
   1416 					head->lio_sigval.sival_ptr =
   1417 					    pn->portnfy_user;
   1418 				} else {	/* SIGEV_SIGNAL */
   1419 					head->lio_signo = sigevp->sigev_signo;
   1420 					head->lio_sigval.sival_ptr =
   1421 					    sigevp->sigev_value.sival_ptr;
   1422 				}
   1423 			}
   1424 			head->lio_nent = head->lio_refcnt = nent;
   1425 			sig_mutex_unlock(&head->lio_mutex);
   1426 		}
   1427 		/*
   1428 		 * find UFS requests, errno == ENOTSUP/EBADFD,
   1429 		 */
   1430 		for (i = 0; i < nent; i++) {
   1431 			if ((aiocbp = list[i]) == NULL ||
   1432 			    aiocbp->aio_lio_opcode == LIO_NOP ||
   1433 			    (aiocbp->aio_resultp.aio_errno != ENOTSUP &&
   1434 			    aiocbp->aio_resultp.aio_errno != EBADFD)) {
   1435 				if (head)
   1436 					_lio_list_decr(head);
   1437 				continue;
   1438 			}
   1439 			if (aiocbp->aio_resultp.aio_errno == EBADFD)
   1440 				SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
   1441 			if (aiocbp->aio_reqprio != 0) {
   1442 				aiocbp->aio_resultp.aio_errno = EINVAL;
   1443 				aiocbp->aio_resultp.aio_return = -1;
   1444 				EIOflg = 1;
   1445 				if (head)
   1446 					_lio_list_decr(head);
   1447 				continue;
   1448 			}
   1449 			/*
   1450 			 * submit an AIO request with flags AIO_NO_KAIO
   1451 			 * to avoid the kaio() syscall in _aio_rw()
   1452 			 */
   1453 			switch (aiocbp->aio_lio_opcode) {
   1454 			case LIO_READ:
   1455 				rw = AIOAREAD64;
   1456 				break;
   1457 			case LIO_WRITE:
   1458 				rw = AIOAWRITE64;
   1459 				break;
   1460 			}
   1461 			error = _aio_rw64(aiocbp, head, &__nextworker_rw, rw,
   1462 			    (AIO_NO_KAIO | AIO_NO_DUPS));
   1463 			if (error == 0)
   1464 				aio_ufs++;
   1465 			else {
   1466 				if (head)
   1467 					_lio_list_decr(head);
   1468 				aiocbp->aio_resultp.aio_errno = error;
   1469 				EIOflg = 1;
   1470 			}
   1471 		}
   1472 	}
   1473 	if (EIOflg) {
   1474 		errno = EIO;
   1475 		return (-1);
   1476 	}
   1477 	if (mode == LIO_WAIT && oerrno == ENOTSUP) {
   1478 		/*
   1479 		 * call kaio(AIOLIOWAIT) to get all outstanding
   1480 		 * kernel AIO requests
   1481 		 */
   1482 		if ((nent - aio_ufs) > 0)
   1483 			(void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp);
   1484 		if (head != NULL && head->lio_nent > 0) {
   1485 			sig_mutex_lock(&head->lio_mutex);
   1486 			while (head->lio_refcnt > 0) {
   1487 				int err;
   1488 				head->lio_waiting = 1;
   1489 				pthread_cleanup_push(_lio_listio_cleanup, head);
   1490 				err = sig_cond_wait(&head->lio_cond_cv,
   1491 				    &head->lio_mutex);
   1492 				pthread_cleanup_pop(0);
   1493 				head->lio_waiting = 0;
   1494 				if (err && head->lio_nent > 0) {
   1495 					sig_mutex_unlock(&head->lio_mutex);
   1496 					errno = err;
   1497 					return (-1);
   1498 				}
   1499 			}
   1500 			sig_mutex_unlock(&head->lio_mutex);
   1501 			ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
   1502 			_aio_lio_free(head);
   1503 			for (i = 0; i < nent; i++) {
   1504 				if ((aiocbp = list[i]) != NULL &&
   1505 				    aiocbp->aio_resultp.aio_errno) {
   1506 					errno = EIO;
   1507 					return (-1);
   1508 				}
   1509 			}
   1510 		}
   1511 		return (0);
   1512 	}
   1513 	return (error);
   1514 }
   1515 
   1516 int
   1517 aio_suspend64(const aiocb64_t * const list[], int nent,
   1518     const timespec_t *timeout)
   1519 {
   1520 	return (__aio_suspend((void **)list, nent, timeout, 1));
   1521 }
   1522 
   1523 int
   1524 aio_error64(const aiocb64_t *aiocbp)
   1525 {
   1526 	const aio_result_t *resultp = &aiocbp->aio_resultp;
   1527 	int error;
   1528 
   1529 	if ((error = resultp->aio_errno) == EINPROGRESS) {
   1530 		if (aiocbp->aio_state == CHECK) {
   1531 			/*
   1532 			 * Always do the kaio() call without using the
   1533 			 * KAIO_SUPPORTED() checks because it is not
   1534 			 * mandatory to have a valid fd set in the
   1535 			 * aiocb, only the resultp must be set.
   1536 			 */
   1537 			if ((int)_kaio(AIOERROR64, aiocbp) == EINVAL) {
   1538 				errno = EINVAL;
   1539 				return (-1);
   1540 			}
   1541 			error = resultp->aio_errno;
   1542 		} else if (aiocbp->aio_state == CHECKED) {
   1543 			((aiocb64_t *)aiocbp)->aio_state = CHECK;
   1544 		}
   1545 	}
   1546 	return (error);
   1547 }
   1548 
   1549 ssize_t
   1550 aio_return64(aiocb64_t *aiocbp)
   1551 {
   1552 	aio_result_t *resultp = &aiocbp->aio_resultp;
   1553 	aio_req_t *reqp;
   1554 	int error;
   1555 	ssize_t retval;
   1556 
   1557 	/*
   1558 	 * The _aiodone() function stores resultp->aio_return before
   1559 	 * storing resultp->aio_errno (with an membar_producer() in
   1560 	 * between).  We use membar_consumer() below to ensure proper
   1561 	 * memory ordering between _aiodone() and ourself.
   1562 	 */
   1563 	error = resultp->aio_errno;
   1564 	membar_consumer();
   1565 	retval = resultp->aio_return;
   1566 
   1567 	/*
   1568 	 * we use this condition to indicate either that
   1569 	 * aio_return() has been called before or should
   1570 	 * not have been called yet.
   1571 	 */
   1572 	if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
   1573 		errno = error;
   1574 		return (-1);
   1575 	}
   1576 
   1577 	/*
   1578 	 * Before we return, mark the result as being returned so that later
   1579 	 * calls to aio_return() will return the fact that the result has
   1580 	 * already been returned.
   1581 	 */
   1582 	sig_mutex_lock(&__aio_mutex);
   1583 	/* retest, in case more than one thread actually got in here */
   1584 	if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) {
   1585 		sig_mutex_unlock(&__aio_mutex);
   1586 		errno = EINVAL;
   1587 		return (-1);
   1588 	}
   1589 	resultp->aio_return = -1;
   1590 	resultp->aio_errno = EINVAL;
   1591 	if ((reqp = _aio_hash_del(resultp)) == NULL)
   1592 		sig_mutex_unlock(&__aio_mutex);
   1593 	else {
   1594 		aiocbp->aio_state = NOCHECK;
   1595 		ASSERT(reqp->req_head == NULL);
   1596 		(void) _aio_req_remove(reqp);
   1597 		sig_mutex_unlock(&__aio_mutex);
   1598 		_aio_req_free(reqp);
   1599 	}
   1600 
   1601 	if (retval == -1)
   1602 		errno = error;
   1603 	return (retval);
   1604 }
   1605 
   1606 static int
   1607 __aio_fsync_bar64(aiocb64_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp,
   1608     int workerscnt)
   1609 {
   1610 	int i;
   1611 	int error;
   1612 	aio_worker_t *next = aiowp;
   1613 
   1614 	for (i = 0; i < workerscnt; i++) {
   1615 		error = _aio_rw64(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO);
   1616 		if (error != 0) {
   1617 			sig_mutex_lock(&head->lio_mutex);
   1618 			head->lio_mode = LIO_DESTROY;	/* ignore fsync */
   1619 			head->lio_nent -= workerscnt - i;
   1620 			head->lio_refcnt -= workerscnt - i;
   1621 			sig_mutex_unlock(&head->lio_mutex);
   1622 			errno = EAGAIN;
   1623 			return (i);
   1624 		}
   1625 		next = next->work_forw;
   1626 	}
   1627 	return (i);
   1628 }
   1629 
   1630 int
   1631 aio_fsync64(int op, aiocb64_t *aiocbp)
   1632 {
   1633 	aio_lio_t *head;
   1634 	struct stat64 statb;
   1635 	int fret;
   1636 
   1637 	if (aiocbp == NULL)
   1638 		return (0);
   1639 	if (op != O_DSYNC && op != O_SYNC) {
   1640 		errno = EINVAL;
   1641 		return (-1);
   1642 	}
   1643 	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
   1644 		errno = EBUSY;
   1645 		return (-1);
   1646 	}
   1647 	if (fstat64(aiocbp->aio_fildes, &statb) < 0)
   1648 		return (-1);
   1649 	if (_aio_sigev_thread64(aiocbp) != 0)
   1650 		return (-1);
   1651 
   1652 	/*
   1653 	 * Kernel aio_fsync() is not supported.
   1654 	 * We force user-level aio_fsync() just
   1655 	 * for the notification side-effect.
   1656 	 */
   1657 	if (!__uaio_ok && __uaio_init() == -1)
   1658 		return (-1);
   1659 
   1660 	/*
   1661 	 * The first asynchronous I/O request in the current process will
   1662 	 * create a bunch of workers (via __uaio_init()).  If the number
   1663 	 * of workers is zero then the number of pending asynchronous I/O
   1664 	 * requests is zero.  In such a case only execute the standard
   1665 	 * fsync(3C) or fdatasync(3RT) as appropriate.
   1666 	 */
   1667 	if (__rw_workerscnt == 0) {
   1668 		if (op == O_DSYNC)
   1669 			return (__fdsync(aiocbp->aio_fildes, FDSYNC));
   1670 		else
   1671 			return (__fdsync(aiocbp->aio_fildes, FSYNC));
   1672 	}
   1673 
   1674 	/*
   1675 	 * re-use aio_offset as the op field.
   1676 	 * 	O_DSYNC - fdatasync()
   1677 	 * 	O_SYNC - fsync()
   1678 	 */
   1679 	aiocbp->aio_offset = op;
   1680 	aiocbp->aio_lio_opcode = AIOFSYNC;
   1681 
   1682 	/*
   1683 	 * Create a list of fsync requests.  The worker that
   1684 	 * gets the last request will do the fsync request.
   1685 	 */
   1686 	head = _aio_lio_alloc();
   1687 	if (head == NULL) {
   1688 		errno = EAGAIN;
   1689 		return (-1);
   1690 	}
   1691 	head->lio_mode = LIO_FSYNC;
   1692 	head->lio_nent = head->lio_refcnt = __rw_workerscnt;
   1693 	head->lio_largefile = 1;
   1694 
   1695 	/*
   1696 	 * Insert an fsync request on every worker's queue.
   1697 	 */
   1698 	fret = __aio_fsync_bar64(aiocbp, head, __workers_rw, __rw_workerscnt);
   1699 	if (fret != __rw_workerscnt) {
   1700 		/*
   1701 		 * Fewer fsync requests than workers means that it was
   1702 		 * not possible to submit fsync requests to all workers.
   1703 		 * Actions:
   1704 		 * a) number of fsync requests submitted is 0:
   1705 		 *    => free allocated memory (aio_lio_t).
   1706 		 * b) number of fsync requests submitted is > 0:
   1707 		 *    => the last worker executing the fsync request
   1708 		 *	 will free the aio_lio_t struct.
   1709 		 */
   1710 		if (fret == 0)
   1711 			_aio_lio_free(head);
   1712 		return (-1);
   1713 	}
   1714 	return (0);
   1715 }
   1716 
   1717 int
   1718 aio_cancel64(int fd, aiocb64_t *aiocbp)
   1719 {
   1720 	aio_req_t *reqp;
   1721 	aio_worker_t *aiowp;
   1722 	int done = 0;
   1723 	int canceled = 0;
   1724 	struct stat64 buf;
   1725 
   1726 	if (fstat64(fd, &buf) < 0)
   1727 		return (-1);
   1728 
   1729 	if (aiocbp != NULL) {
   1730 		if (fd != aiocbp->aio_fildes) {
   1731 			errno = EINVAL;
   1732 			return (-1);
   1733 		}
   1734 		if (aiocbp->aio_state == USERAIO) {
   1735 			sig_mutex_lock(&__aio_mutex);
   1736 			reqp = _aio_hash_find(&aiocbp->aio_resultp);
   1737 			if (reqp == NULL) {
   1738 				sig_mutex_unlock(&__aio_mutex);
   1739 				return (AIO_ALLDONE);
   1740 			}
   1741 			aiowp = reqp->req_worker;
   1742 			sig_mutex_lock(&aiowp->work_qlock1);
   1743 			(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
   1744 			sig_mutex_unlock(&aiowp->work_qlock1);
   1745 			sig_mutex_unlock(&__aio_mutex);
   1746 			if (done)
   1747 				return (AIO_ALLDONE);
   1748 			if (canceled)
   1749 				return (AIO_CANCELED);
   1750 			return (AIO_NOTCANCELED);
   1751 		}
   1752 		if (aiocbp->aio_state == USERAIO_DONE)
   1753 			return (AIO_ALLDONE);
   1754 		return ((int)_kaio(AIOCANCEL, fd, aiocbp));
   1755 	}
   1756 
   1757 	return (aiocancel_all(fd));
   1758 }
   1759 
   1760 int
   1761 aio_waitn64(aiocb64_t *list[], uint_t nent, uint_t *nwait,
   1762 	const timespec_t *timeout)
   1763 {
   1764 	return (__aio_waitn((void **)list, nent, nwait, timeout));
   1765 }
   1766 
   1767 #endif /* !defined(_LP64) */
   1768