Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Kernel asynchronous I/O.
     29  * This is only for raw devices now (as of Nov. 1993).
     30  */
     31 
     32 #include <sys/types.h>
     33 #include <sys/errno.h>
     34 #include <sys/conf.h>
     35 #include <sys/file.h>
     36 #include <sys/fs/snode.h>
     37 #include <sys/unistd.h>
     38 #include <sys/cmn_err.h>
     39 #include <vm/as.h>
     40 #include <vm/faultcode.h>
     41 #include <sys/sysmacros.h>
     42 #include <sys/procfs.h>
     43 #include <sys/kmem.h>
     44 #include <sys/autoconf.h>
     45 #include <sys/ddi_impldefs.h>
     46 #include <sys/sunddi.h>
     47 #include <sys/aio_impl.h>
     48 #include <sys/debug.h>
     49 #include <sys/param.h>
     50 #include <sys/systm.h>
     51 #include <sys/vmsystm.h>
     52 #include <sys/fs/pxfs_ki.h>
     53 #include <sys/contract/process_impl.h>
     54 
     55 /*
     56  * external entry point.
     57  */
#ifdef _LP64
/*
 * Entry point installed in kaio_sysent on _LP64 kernels; receives the
 * six system call arguments directly.
 */
static int64_t kaioc(long, long, long, long, long, long);
#endif
/*
 * Entry point that receives its arguments as an array plus an rval_t;
 * installed in kaio_sysent32 (_LP64 with _SYSCALL32_IMPL) or in
 * kaio_sysent (non-_LP64 kernels).
 */
static int kaio(ulong_t *, rval_t *);
     62 
     63 
/*
 * "Run mode" selectors handed by the entry points to the worker routines
 * (aliowait(), aiosuspend(), aioerror(), aiorw(), aio_cancel()).
 * kaioc() always passes AIO_64; kaio() passes AIO_32 for the plain
 * opcodes and AIO_LARGEFILE for the *64 opcodes.
 */
#define	AIO_64	0
#define	AIO_32	1
#define	AIO_LARGEFILE	2
     67 
     68 /*
     69  * implementation specific functions (private)
     70  */
/* Native list-I/O worker; only referenced from kaioc(), hence _LP64 only. */
#ifdef _LP64
static int alio(int, aiocb_t **, int, struct sigevent *);
#endif
/* Workers dispatched by opcode from kaioc()/kaio(). */
static int aionotify(void);
static int aioinit(void);
static int aiostart(void);
static void alio_cleanup(aio_t *, aiocb_t **, int, int);
/* Returns a pointer to a function taking (vnode_t *, struct aio_req *, cred_t *). */
static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
    cred_t *);
static void lio_set_error(aio_req_t *, int portused);
static aio_t *aio_aiop_alloc();
/* Request bookkeeping helpers. */
static int aio_req_alloc(aio_req_t **, aio_result_t *);
static int aio_lio_alloc(aio_lio_t **);
static aio_req_t *aio_req_done(void *);
static aio_req_t *aio_req_remove(aio_req_t *);
static int aio_req_find(aio_result_t *, aio_req_t **);
static int aio_hash_insert(struct aio_req_t *, aio_t *);
static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
    aio_result_t *, vnode_t *, int);
static int aio_cleanup_thread(aio_t *);
static aio_lio_t *aio_list_get(aio_result_t *);
static void lio_set_uerror(void *, int);
extern void aio_zerolen(aio_req_t *);
/* Wait/reap workers and their helpers. */
static int aiowait(struct timeval *, int, long	*);
static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
    aio_req_t *reqlist, aio_t *aiop, model_t model);
static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
static int aiosuspend(void *, int, struct  timespec *, int,
    long	*, int);
static int aliowait(int, void *, int, void *, int);
static int aioerror(void *, int);
static int aio_cancel(int, void *, long	*, int);
/* Read/write submission workers. */
static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
static int aiorw(int, void *, int, int);

/* List-I/O workers used by kaio() for the AIOLIO64 and AIOLIO opcodes. */
static int alioLF(int, void *, int, void *);
static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
    aio_result_t *, vnode_t *, int);
static int alio32(int, void *, int, void *);
static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);

/* Control block converters; only built with 32-bit syscall support. */
#ifdef  _SYSCALL32_IMPL
static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
void	aiocb_32ton(aiocb32_t *, aiocb_t *);
#endif /* _SYSCALL32_IMPL */

/*
 * implementation specific functions (external)
 */
void aio_req_free(aio_t *, aio_req_t *);

/*
 * Event Port framework
 */

void aio_req_free_port(aio_t *, aio_req_t *);
static int aio_port_callback(void *, int *, pid_t, int, void *);
    130 
    131 /*
    132  * This is the loadable module wrapper.
    133  */
    134 #include <sys/modctl.h>
    135 #include <sys/syscall.h>
    136 
/*
 * System call table entries for this module.
 *
 * Exactly one kaio_sysent is built.  On _LP64 kernels it points at kaioc()
 * and carries SE_ARGC; otherwise it points at kaio().  With
 * _SYSCALL32_IMPL an additional kaio_sysent32 routes to kaio().
 * NOTE(review): the leading 6/7 is presumably the argument count
 * (kaioc() takes six arguments, kaio() reads uap[0]..uap[6]) -- confirm
 * against the struct sysent definition.
 */
#ifdef _LP64

static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif  /* _LP64 */
    162 
    163 /*
    164  * Module linkage information for the kernel.
    165  */
    166 
/* Linkage record tying kaio_sysent to the system call module ops. */
static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};

#ifdef  _SYSCALL32_IMPL
/* Same as above for the 32-bit compatibility entry, kaio_sysent32. */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */


/*
 * NULL-terminated list of the linkage records above; this is the object
 * handed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info().
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef  _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
    190 
    191 int
    192 _init(void)
    193 {
    194 	int retval;
    195 
    196 	if ((retval = mod_install(&modlinkage)) != 0)
    197 		return (retval);
    198 
    199 	return (0);
    200 }
    201 
    202 int
    203 _fini(void)
    204 {
    205 	int retval;
    206 
    207 	retval = mod_remove(&modlinkage);
    208 
    209 	return (retval);
    210 }
    211 
    212 int
    213 _info(struct modinfo *modinfop)
    214 {
    215 	return (mod_info(&modlinkage, modinfop));
    216 }
    217 
    218 #ifdef	_LP64
    219 static int64_t
    220 kaioc(
    221 	long	a0,
    222 	long	a1,
    223 	long	a2,
    224 	long	a3,
    225 	long	a4,
    226 	long	a5)
    227 {
    228 	int	error;
    229 	long	rval = 0;
    230 
    231 	switch ((int)a0 & ~AIO_POLL_BIT) {
    232 	case AIOREAD:
    233 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
    234 		    (offset_t)a4, (aio_result_t *)a5, FREAD);
    235 		break;
    236 	case AIOWRITE:
    237 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
    238 		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
    239 		break;
    240 	case AIOWAIT:
    241 		error = aiowait((struct timeval *)a1, (int)a2, &rval);
    242 		break;
    243 	case AIOWAITN:
    244 		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
    245 		    (timespec_t *)a4);
    246 		break;
    247 	case AIONOTIFY:
    248 		error = aionotify();
    249 		break;
    250 	case AIOINIT:
    251 		error = aioinit();
    252 		break;
    253 	case AIOSTART:
    254 		error = aiostart();
    255 		break;
    256 	case AIOLIO:
    257 		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
    258 		    (struct sigevent *)a4);
    259 		break;
    260 	case AIOLIOWAIT:
    261 		error = aliowait((int)a1, (void *)a2, (int)a3,
    262 		    (struct sigevent *)a4, AIO_64);
    263 		break;
    264 	case AIOSUSPEND:
    265 		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
    266 		    (int)a4, &rval, AIO_64);
    267 		break;
    268 	case AIOERROR:
    269 		error = aioerror((void *)a1, AIO_64);
    270 		break;
    271 	case AIOAREAD:
    272 		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
    273 		break;
    274 	case AIOAWRITE:
    275 		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
    276 		break;
    277 	case AIOCANCEL:
    278 		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
    279 		break;
    280 
    281 	/*
    282 	 * The large file related stuff is valid only for
    283 	 * 32 bit kernel and not for 64 bit kernel
    284 	 * On 64 bit kernel we convert large file calls
    285 	 * to regular 64bit calls.
    286 	 */
    287 
    288 	default:
    289 		error = EINVAL;
    290 	}
    291 	if (error)
    292 		return ((int64_t)set_errno(error));
    293 	return (rval);
    294 }
    295 #endif
    296 
    297 static int
    298 kaio(
    299 	ulong_t *uap,
    300 	rval_t *rvp)
    301 {
    302 	long rval = 0;
    303 	int	error = 0;
    304 	offset_t	off;
    305 
    306 
    307 		rvp->r_vals = 0;
    308 #if defined(_LITTLE_ENDIAN)
    309 	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
    310 #else
    311 	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
    312 #endif
    313 
    314 	switch (uap[0] & ~AIO_POLL_BIT) {
    315 	/*
    316 	 * It must be the 32 bit system call on 64 bit kernel
    317 	 */
    318 	case AIOREAD:
    319 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
    320 		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
    321 	case AIOWRITE:
    322 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
    323 		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
    324 	case AIOWAIT:
    325 		error = aiowait((struct	timeval *)uap[1], (int)uap[2],
    326 		    &rval);
    327 		break;
    328 	case AIOWAITN:
    329 		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
    330 		    (uint_t *)uap[3], (timespec_t *)uap[4]);
    331 		break;
    332 	case AIONOTIFY:
    333 		return (aionotify());
    334 	case AIOINIT:
    335 		return (aioinit());
    336 	case AIOSTART:
    337 		return (aiostart());
    338 	case AIOLIO:
    339 		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
    340 		    (void *)uap[4]));
    341 	case AIOLIOWAIT:
    342 		return (aliowait((int)uap[1], (void *)uap[2],
    343 		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
    344 	case AIOSUSPEND:
    345 		error = aiosuspend((void *)uap[1], (int)uap[2],
    346 		    (timespec_t *)uap[3], (int)uap[4],
    347 		    &rval, AIO_32);
    348 		break;
    349 	case AIOERROR:
    350 		return (aioerror((void *)uap[1], AIO_32));
    351 	case AIOAREAD:
    352 		return (aiorw((int)uap[0], (void *)uap[1],
    353 		    FREAD, AIO_32));
    354 	case AIOAWRITE:
    355 		return (aiorw((int)uap[0], (void *)uap[1],
    356 		    FWRITE, AIO_32));
    357 	case AIOCANCEL:
    358 		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
    359 		    AIO_32));
    360 		break;
    361 	case AIOLIO64:
    362 		return (alioLF((int)uap[1], (void *)uap[2],
    363 		    (int)uap[3], (void *)uap[4]));
    364 	case AIOLIOWAIT64:
    365 		return (aliowait(uap[1], (void *)uap[2],
    366 		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
    367 	case AIOSUSPEND64:
    368 		error = aiosuspend((void *)uap[1], (int)uap[2],
    369 		    (timespec_t *)uap[3], (int)uap[4], &rval,
    370 		    AIO_LARGEFILE);
    371 		break;
    372 	case AIOERROR64:
    373 		return (aioerror((void *)uap[1], AIO_LARGEFILE));
    374 	case AIOAREAD64:
    375 		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
    376 		    AIO_LARGEFILE));
    377 	case AIOAWRITE64:
    378 		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
    379 		    AIO_LARGEFILE));
    380 	case AIOCANCEL64:
    381 		error = (aio_cancel((int)uap[1], (void *)uap[2],
    382 		    &rval, AIO_LARGEFILE));
    383 		break;
    384 	default:
    385 		return (EINVAL);
    386 	}
    387 
    388 	rvp->r_val1 = rval;
    389 	return (error);
    390 }
    391 
    392 /*
    393  * wake up LWPs in this process that are sleeping in
    394  * aiowait().
    395  */
    396 static int
    397 aionotify(void)
    398 {
    399 	aio_t	*aiop;
    400 
    401 	aiop = curproc->p_aio;
    402 	if (aiop == NULL)
    403 		return (0);
    404 
    405 	mutex_enter(&aiop->aio_mutex);
    406 	aiop->aio_notifycnt++;
    407 	cv_broadcast(&aiop->aio_waitcv);
    408 	mutex_exit(&aiop->aio_mutex);
    409 
    410 	return (0);
    411 }
    412 
    413 static int
    414 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
    415 	timestruc_t **rqtp, int *blocking)
    416 {
    417 #ifdef	_SYSCALL32_IMPL
    418 	struct timeval32 wait_time_32;
    419 #endif
    420 	struct timeval wait_time;
    421 	model_t	model = get_udatamodel();
    422 
    423 	*rqtp = NULL;
    424 	if (timout == NULL) {		/* wait indefinitely */
    425 		*blocking = 1;
    426 		return (0);
    427 	}
    428 
    429 	/*
    430 	 * Need to correctly compare with the -1 passed in for a user
    431 	 * address pointer, with both 32 bit and 64 bit apps.
    432 	 */
    433 	if (model == DATAMODEL_NATIVE) {
    434 		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
    435 			*blocking = 0;
    436 			return (0);
    437 		}
    438 
    439 		if (copyin(timout, &wait_time, sizeof (wait_time)))
    440 			return (EFAULT);
    441 	}
    442 #ifdef	_SYSCALL32_IMPL
    443 	else {
    444 		/*
    445 		 * -1 from a 32bit app. It will not get sign extended.
    446 		 * don't wait if -1.
    447 		 */
    448 		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
    449 			*blocking = 0;
    450 			return (0);
    451 		}
    452 
    453 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
    454 			return (EFAULT);
    455 		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
    456 	}
    457 #endif  /* _SYSCALL32_IMPL */
    458 
    459 	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
    460 		*blocking = 0;
    461 		return (0);
    462 	}
    463 
    464 	if (wait_time.tv_sec < 0 ||
    465 	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
    466 		return (EINVAL);
    467 
    468 	rqtime->tv_sec = wait_time.tv_sec;
    469 	rqtime->tv_nsec = wait_time.tv_usec * 1000;
    470 	*rqtp = rqtime;
    471 	*blocking = 1;
    472 
    473 	return (0);
    474 }
    475 
    476 static int
    477 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
    478 	timestruc_t **rqtp, int *blocking)
    479 {
    480 #ifdef	_SYSCALL32_IMPL
    481 	timespec32_t wait_time_32;
    482 #endif
    483 	model_t	model = get_udatamodel();
    484 
    485 	*rqtp = NULL;
    486 	if (timout == NULL) {
    487 		*blocking = 1;
    488 		return (0);
    489 	}
    490 
    491 	if (model == DATAMODEL_NATIVE) {
    492 		if (copyin(timout, rqtime, sizeof (*rqtime)))
    493 			return (EFAULT);
    494 	}
    495 #ifdef	_SYSCALL32_IMPL
    496 	else {
    497 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
    498 			return (EFAULT);
    499 		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
    500 	}
    501 #endif  /* _SYSCALL32_IMPL */
    502 
    503 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
    504 		*blocking = 0;
    505 		return (0);
    506 	}
    507 
    508 	if (rqtime->tv_sec < 0 ||
    509 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
    510 		return (EINVAL);
    511 
    512 	*rqtp = rqtime;
    513 	*blocking = 1;
    514 
    515 	return (0);
    516 }
    517 
    518 /*ARGSUSED*/
    519 static int
    520 aiowait(
    521 	struct timeval	*timout,
    522 	int	dontblockflg,
    523 	long	*rval)
    524 {
    525 	int 		error;
    526 	aio_t		*aiop;
    527 	aio_req_t	*reqp;
    528 	clock_t		status;
    529 	int		blocking;
    530 	int		timecheck;
    531 	timestruc_t	rqtime;
    532 	timestruc_t	*rqtp;
    533 
    534 	aiop = curproc->p_aio;
    535 	if (aiop == NULL)
    536 		return (EINVAL);
    537 
    538 	/*
    539 	 * Establish the absolute future time for the timeout.
    540 	 */
    541 	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
    542 	if (error)
    543 		return (error);
    544 	if (rqtp) {
    545 		timestruc_t now;
    546 		timecheck = timechanged;
    547 		gethrestime(&now);
    548 		timespecadd(rqtp, &now);
    549 	}
    550 
    551 	mutex_enter(&aiop->aio_mutex);
    552 	for (;;) {
    553 		/* process requests on poll queue */
    554 		if (aiop->aio_pollq) {
    555 			mutex_exit(&aiop->aio_mutex);
    556 			aio_cleanup(0);
    557 			mutex_enter(&aiop->aio_mutex);
    558 		}
    559 		if ((reqp = aio_req_remove(NULL)) != NULL) {
    560 			*rval = (long)reqp->aio_req_resultp;
    561 			break;
    562 		}
    563 		/* user-level done queue might not be empty */
    564 		if (aiop->aio_notifycnt > 0) {
    565 			aiop->aio_notifycnt--;
    566 			*rval = 1;
    567 			break;
    568 		}
    569 		/* don't block if no outstanding aio */
    570 		if (aiop->aio_outstanding == 0 && dontblockflg) {
    571 			error = EINVAL;
    572 			break;
    573 		}
    574 		if (blocking) {
    575 			status = cv_waituntil_sig(&aiop->aio_waitcv,
    576 			    &aiop->aio_mutex, rqtp, timecheck);
    577 
    578 			if (status > 0)		/* check done queue again */
    579 				continue;
    580 			if (status == 0) {	/* interrupted by a signal */
    581 				error = EINTR;
    582 				*rval = -1;
    583 			} else {		/* timer expired */
    584 				error = ETIME;
    585 			}
    586 		}
    587 		break;
    588 	}
    589 	mutex_exit(&aiop->aio_mutex);
    590 	if (reqp) {
    591 		aphysio_unlock(reqp);
    592 		aio_copyout_result(reqp);
    593 		mutex_enter(&aiop->aio_mutex);
    594 		aio_req_free(aiop, reqp);
    595 		mutex_exit(&aiop->aio_mutex);
    596 	}
    597 	return (error);
    598 }
    599 
    600 /*
    601  * aiowaitn can be used to reap completed asynchronous requests submitted with
    602  * lio_listio, aio_read or aio_write.
    603  * This function only reaps asynchronous raw I/Os.
    604  */
    605 
    606 /*ARGSUSED*/
    607 static int
    608 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
    609 {
    610 	int 		error = 0;
    611 	aio_t		*aiop;
    612 	aio_req_t	*reqlist = NULL;
    613 	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
    614 	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
    615 	size_t		iocbsz;			/* users iocb size */
    616 	size_t		riocbsz;		/* returned iocb size */
    617 	int		iocb_index = 0;
    618 	model_t		model = get_udatamodel();
    619 	int		blocking = 1;
    620 	int		timecheck;
    621 	timestruc_t	rqtime;
    622 	timestruc_t	*rqtp;
    623 
    624 	aiop = curproc->p_aio;
    625 	if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
    626 		return (EINVAL);
    627 
    628 	if (aiop->aio_outstanding == 0)
    629 		return (EAGAIN);
    630 
    631 	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
    632 		return (EFAULT);
    633 
    634 	/* set *nwait to zero, if we must return prematurely */
    635 	if (copyout(&cnt, nwait, sizeof (uint_t)))
    636 		return (EFAULT);
    637 
    638 	if (waitcnt == 0) {
    639 		blocking = 0;
    640 		rqtp = NULL;
    641 		waitcnt = nent;
    642 	} else {
    643 		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
    644 		if (error)
    645 			return (error);
    646 	}
    647 
    648 	if (model == DATAMODEL_NATIVE)
    649 		iocbsz = (sizeof (aiocb_t *) * nent);
    650 #ifdef	_SYSCALL32_IMPL
    651 	else
    652 		iocbsz = (sizeof (caddr32_t) * nent);
    653 #endif  /* _SYSCALL32_IMPL */
    654 
    655 	/*
    656 	 * Only one aio_waitn call is allowed at a time.
    657 	 * The active aio_waitn will collect all requests
    658 	 * out of the "done" list and if necessary it will wait
    659 	 * for some/all pending requests to fulfill the nwait
    660 	 * parameter.
    661 	 * A second or further aio_waitn calls will sleep here
    662 	 * until the active aio_waitn finishes and leaves the kernel
    663 	 * If the second call does not block (poll), then return
    664 	 * immediately with the error code : EAGAIN.
    665 	 * If the second call should block, then sleep here, but
    666 	 * do not touch the timeout. The timeout starts when this
    667 	 * aio_waitn-call becomes active.
    668 	 */
    669 
    670 	mutex_enter(&aiop->aio_mutex);
    671 
    672 	while (aiop->aio_flags & AIO_WAITN) {
    673 		if (blocking == 0) {
    674 			mutex_exit(&aiop->aio_mutex);
    675 			return (EAGAIN);
    676 		}
    677 
    678 		/* block, no timeout */
    679 		aiop->aio_flags |= AIO_WAITN_PENDING;
    680 		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
    681 			mutex_exit(&aiop->aio_mutex);
    682 			return (EINTR);
    683 		}
    684 	}
    685 
    686 	/*
    687 	 * Establish the absolute future time for the timeout.
    688 	 */
    689 	if (rqtp) {
    690 		timestruc_t now;
    691 		timecheck = timechanged;
    692 		gethrestime(&now);
    693 		timespecadd(rqtp, &now);
    694 	}
    695 
    696 	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
    697 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
    698 		aiop->aio_iocb = NULL;
    699 	}
    700 
    701 	if (aiop->aio_iocb == NULL) {
    702 		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
    703 		if (iocblist == NULL) {
    704 			mutex_exit(&aiop->aio_mutex);
    705 			return (ENOMEM);
    706 		}
    707 		aiop->aio_iocb = (aiocb_t **)iocblist;
    708 		aiop->aio_iocbsz = iocbsz;
    709 	} else {
    710 		iocblist = (char *)aiop->aio_iocb;
    711 	}
    712 
    713 	aiop->aio_waitncnt = waitcnt;
    714 	aiop->aio_flags |= AIO_WAITN;
    715 
    716 	for (;;) {
    717 		/* push requests on poll queue to done queue */
    718 		if (aiop->aio_pollq) {
    719 			mutex_exit(&aiop->aio_mutex);
    720 			aio_cleanup(0);
    721 			mutex_enter(&aiop->aio_mutex);
    722 		}
    723 
    724 		/* check for requests on done queue */
    725 		if (aiop->aio_doneq) {
    726 			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
    727 			aiop->aio_waitncnt = waitcnt - cnt;
    728 		}
    729 
    730 		/* user-level done queue might not be empty */
    731 		if (aiop->aio_notifycnt > 0) {
    732 			aiop->aio_notifycnt--;
    733 			error = 0;
    734 			break;
    735 		}
    736 
    737 		/*
    738 		 * if we are here second time as a result of timer
    739 		 * expiration, we reset error if there are enough
    740 		 * aiocb's to satisfy request.
    741 		 * We return also if all requests are already done
    742 		 * and we picked up the whole done queue.
    743 		 */
    744 
    745 		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
    746 		    aiop->aio_doneq == NULL)) {
    747 			error = 0;
    748 			break;
    749 		}
    750 
    751 		if ((cnt < waitcnt) && blocking) {
    752 			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
    753 			    &aiop->aio_mutex, rqtp, timecheck);
    754 			if (rval > 0)
    755 				continue;
    756 			if (rval < 0) {
    757 				error = ETIME;
    758 				blocking = 0;
    759 				continue;
    760 			}
    761 			error = EINTR;
    762 		}
    763 		break;
    764 	}
    765 
    766 	mutex_exit(&aiop->aio_mutex);
    767 
    768 	if (cnt > 0) {
    769 
    770 		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
    771 		    aiop, model);
    772 
    773 		if (model == DATAMODEL_NATIVE)
    774 			riocbsz = (sizeof (aiocb_t *) * cnt);
    775 #ifdef	_SYSCALL32_IMPL
    776 		else
    777 			riocbsz = (sizeof (caddr32_t) * cnt);
    778 #endif  /* _SYSCALL32_IMPL */
    779 
    780 		if (copyout(iocblist, uiocb, riocbsz) ||
    781 		    copyout(&cnt, nwait, sizeof (uint_t)))
    782 			error = EFAULT;
    783 	}
    784 
    785 	/* check if there is another thread waiting for execution */
    786 	mutex_enter(&aiop->aio_mutex);
    787 	aiop->aio_flags &= ~AIO_WAITN;
    788 	if (aiop->aio_flags & AIO_WAITN_PENDING) {
    789 		aiop->aio_flags &= ~AIO_WAITN_PENDING;
    790 		cv_signal(&aiop->aio_waitncv);
    791 	}
    792 	mutex_exit(&aiop->aio_mutex);
    793 
    794 	return (error);
    795 }
    796 
    797 /*
    798  * aio_unlock_requests
    799  * copyouts the result of the request as well as the return value.
    800  * It builds the list of completed asynchronous requests,
    801  * unlocks the allocated memory ranges and
    802  * put the aio request structure back into the free list.
    803  */
    804 
    805 static int
    806 aio_unlock_requests(
    807 	caddr_t	iocblist,
    808 	int	iocb_index,
    809 	aio_req_t *reqlist,
    810 	aio_t	*aiop,
    811 	model_t	model)
    812 {
    813 	aio_req_t	*reqp, *nreqp;
    814 
    815 	if (model == DATAMODEL_NATIVE) {
    816 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
    817 			(((caddr_t *)iocblist)[iocb_index++]) =
    818 			    reqp->aio_req_iocb.iocb;
    819 			nreqp = reqp->aio_req_next;
    820 			aphysio_unlock(reqp);
    821 			aio_copyout_result(reqp);
    822 			mutex_enter(&aiop->aio_mutex);
    823 			aio_req_free(aiop, reqp);
    824 			mutex_exit(&aiop->aio_mutex);
    825 		}
    826 	}
    827 #ifdef	_SYSCALL32_IMPL
    828 	else {
    829 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
    830 			((caddr32_t *)iocblist)[iocb_index++] =
    831 			    reqp->aio_req_iocb.iocb32;
    832 			nreqp = reqp->aio_req_next;
    833 			aphysio_unlock(reqp);
    834 			aio_copyout_result(reqp);
    835 			mutex_enter(&aiop->aio_mutex);
    836 			aio_req_free(aiop, reqp);
    837 			mutex_exit(&aiop->aio_mutex);
    838 		}
    839 	}
    840 #endif	/* _SYSCALL32_IMPL */
    841 	return (iocb_index);
    842 }
    843 
    844 /*
    845  * aio_reqlist_concat
    846  * moves "max" elements from the done queue to the reqlist queue and removes
    847  * the AIO_DONEQ flag.
    848  * - reqlist queue is a simple linked list
    849  * - done queue is a double linked list
    850  */
    851 
    852 static int
    853 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
    854 {
    855 	aio_req_t *q2, *q2work, *list;
    856 	int count = 0;
    857 
    858 	list = *reqlist;
    859 	q2 = aiop->aio_doneq;
    860 	q2work = q2;
    861 	while (max-- > 0) {
    862 		q2work->aio_req_flags &= ~AIO_DONEQ;
    863 		q2work = q2work->aio_req_next;
    864 		count++;
    865 		if (q2work == q2)
    866 			break;
    867 	}
    868 
    869 	if (q2work == q2) {
    870 		/* all elements revised */
    871 		q2->aio_req_prev->aio_req_next = list;
    872 		list = q2;
    873 		aiop->aio_doneq = NULL;
    874 	} else {
    875 		/*
    876 		 * max < elements in the doneq
    877 		 * detach only the required amount of elements
    878 		 * out of the doneq
    879 		 */
    880 		q2work->aio_req_prev->aio_req_next = list;
    881 		list = q2;
    882 
    883 		aiop->aio_doneq = q2work;
    884 		q2work->aio_req_prev = q2->aio_req_prev;
    885 		q2->aio_req_prev->aio_req_next = q2work;
    886 	}
    887 	*reqlist = list;
    888 	return (count);
    889 }
    890 
    891 /*ARGSUSED*/
    892 static int
    893 aiosuspend(
    894 	void	*aiocb,
    895 	int	nent,
    896 	struct	timespec	*timout,
    897 	int	flag,
    898 	long	*rval,
    899 	int	run_mode)
    900 {
    901 	int 		error;
    902 	aio_t		*aiop;
    903 	aio_req_t	*reqp, *found, *next;
    904 	caddr_t		cbplist = NULL;
    905 	aiocb_t		*cbp, **ucbp;
    906 #ifdef	_SYSCALL32_IMPL
    907 	aiocb32_t	*cbp32;
    908 	caddr32_t	*ucbp32;
    909 #endif  /* _SYSCALL32_IMPL */
    910 	aiocb64_32_t	*cbp64;
    911 	int		rv;
    912 	int		i;
    913 	size_t		ssize;
    914 	model_t		model = get_udatamodel();
    915 	int		blocking;
    916 	int		timecheck;
    917 	timestruc_t	rqtime;
    918 	timestruc_t	*rqtp;
    919 
    920 	aiop = curproc->p_aio;
    921 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
    922 		return (EINVAL);
    923 
    924 	/*
    925 	 * Establish the absolute future time for the timeout.
    926 	 */
    927 	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
    928 	if (error)
    929 		return (error);
    930 	if (rqtp) {
    931 		timestruc_t now;
    932 		timecheck = timechanged;
    933 		gethrestime(&now);
    934 		timespecadd(rqtp, &now);
    935 	}
    936 
    937 	/*
    938 	 * If we are not blocking and there's no IO complete
    939 	 * skip aiocb copyin.
    940 	 */
    941 	if (!blocking && (aiop->aio_pollq == NULL) &&
    942 	    (aiop->aio_doneq == NULL)) {
    943 		return (EAGAIN);
    944 	}
    945 
    946 	if (model == DATAMODEL_NATIVE)
    947 		ssize = (sizeof (aiocb_t *) * nent);
    948 #ifdef	_SYSCALL32_IMPL
    949 	else
    950 		ssize = (sizeof (caddr32_t) * nent);
    951 #endif  /* _SYSCALL32_IMPL */
    952 
    953 	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
    954 	if (cbplist == NULL)
    955 		return (ENOMEM);
    956 
    957 	if (copyin(aiocb, cbplist, ssize)) {
    958 		error = EFAULT;
    959 		goto done;
    960 	}
    961 
    962 	found = NULL;
    963 	/*
    964 	 * we need to get the aio_cleanupq_mutex since we call
    965 	 * aio_req_done().
    966 	 */
    967 	mutex_enter(&aiop->aio_cleanupq_mutex);
    968 	mutex_enter(&aiop->aio_mutex);
    969 	for (;;) {
    970 		/* push requests on poll queue to done queue */
    971 		if (aiop->aio_pollq) {
    972 			mutex_exit(&aiop->aio_mutex);
    973 			mutex_exit(&aiop->aio_cleanupq_mutex);
    974 			aio_cleanup(0);
    975 			mutex_enter(&aiop->aio_cleanupq_mutex);
    976 			mutex_enter(&aiop->aio_mutex);
    977 		}
    978 		/* check for requests on done queue */
    979 		if (aiop->aio_doneq) {
    980 			if (model == DATAMODEL_NATIVE)
    981 				ucbp = (aiocb_t **)cbplist;
    982 #ifdef	_SYSCALL32_IMPL
    983 			else
    984 				ucbp32 = (caddr32_t *)cbplist;
    985 #endif  /* _SYSCALL32_IMPL */
    986 			for (i = 0; i < nent; i++) {
    987 				if (model == DATAMODEL_NATIVE) {
    988 					if ((cbp = *ucbp++) == NULL)
    989 						continue;
    990 					if (run_mode != AIO_LARGEFILE)
    991 						reqp = aio_req_done(
    992 						    &cbp->aio_resultp);
    993 					else {
    994 						cbp64 = (aiocb64_32_t *)cbp;
    995 						reqp = aio_req_done(
    996 						    &cbp64->aio_resultp);
    997 					}
    998 				}
    999 #ifdef	_SYSCALL32_IMPL
   1000 				else {
   1001 					if (run_mode == AIO_32) {
   1002 						if ((cbp32 =
   1003 						    (aiocb32_t *)(uintptr_t)
   1004 						    *ucbp32++) == NULL)
   1005 							continue;
   1006 						reqp = aio_req_done(
   1007 						    &cbp32->aio_resultp);
   1008 					} else if (run_mode == AIO_LARGEFILE) {
   1009 						if ((cbp64 =
   1010 						    (aiocb64_32_t *)(uintptr_t)
   1011 						    *ucbp32++) == NULL)
   1012 							continue;
   1013 						reqp = aio_req_done(
   1014 						    &cbp64->aio_resultp);
   1015 					}
   1016 
   1017 				}
   1018 #endif  /* _SYSCALL32_IMPL */
   1019 				if (reqp) {
   1020 					reqp->aio_req_next = found;
   1021 					found = reqp;
   1022 				}
   1023 				if (aiop->aio_doneq == NULL)
   1024 					break;
   1025 			}
   1026 			if (found)
   1027 				break;
   1028 		}
   1029 		if (aiop->aio_notifycnt > 0) {
   1030 			/*
   1031 			 * nothing on the kernel's queue. the user
   1032 			 * has notified the kernel that it has items
   1033 			 * on a user-level queue.
   1034 			 */
   1035 			aiop->aio_notifycnt--;
   1036 			*rval = 1;
   1037 			error = 0;
   1038 			break;
   1039 		}
   1040 		/* don't block if nothing is outstanding */
   1041 		if (aiop->aio_outstanding == 0) {
   1042 			error = EAGAIN;
   1043 			break;
   1044 		}
   1045 		if (blocking) {
   1046 			/*
   1047 			 * drop the aio_cleanupq_mutex as we are
   1048 			 * going to block.
   1049 			 */
   1050 			mutex_exit(&aiop->aio_cleanupq_mutex);
   1051 			rv = cv_waituntil_sig(&aiop->aio_waitcv,
   1052 			    &aiop->aio_mutex, rqtp, timecheck);
   1053 			/*
   1054 			 * we have to drop aio_mutex and
   1055 			 * grab it in the right order.
   1056 			 */
   1057 			mutex_exit(&aiop->aio_mutex);
   1058 			mutex_enter(&aiop->aio_cleanupq_mutex);
   1059 			mutex_enter(&aiop->aio_mutex);
   1060 			if (rv > 0)	/* check done queue again */
   1061 				continue;
   1062 			if (rv == 0)	/* interrupted by a signal */
   1063 				error = EINTR;
   1064 			else		/* timer expired */
   1065 				error = ETIME;
   1066 		} else {
   1067 			error = EAGAIN;
   1068 		}
   1069 		break;
   1070 	}
   1071 	mutex_exit(&aiop->aio_mutex);
   1072 	mutex_exit(&aiop->aio_cleanupq_mutex);
   1073 	for (reqp = found; reqp != NULL; reqp = next) {
   1074 		next = reqp->aio_req_next;
   1075 		aphysio_unlock(reqp);
   1076 		aio_copyout_result(reqp);
   1077 		mutex_enter(&aiop->aio_mutex);
   1078 		aio_req_free(aiop, reqp);
   1079 		mutex_exit(&aiop->aio_mutex);
   1080 	}
   1081 done:
   1082 	kmem_free(cbplist, ssize);
   1083 	return (error);
   1084 }
   1085 
   1086 /*
   1087  * initialize aio by allocating an aio_t struct for this
   1088  * process.
   1089  */
   1090 static int
   1091 aioinit(void)
   1092 {
   1093 	proc_t *p = curproc;
   1094 	aio_t *aiop;
   1095 	mutex_enter(&p->p_lock);
   1096 	if ((aiop = p->p_aio) == NULL) {
   1097 		aiop = aio_aiop_alloc();
   1098 		p->p_aio = aiop;
   1099 	}
   1100 	mutex_exit(&p->p_lock);
   1101 	if (aiop == NULL)
   1102 		return (ENOMEM);
   1103 	return (0);
   1104 }
   1105 
   1106 /*
   1107  * start a special thread that will cleanup after aio requests
   1108  * that are preventing a segment from being unmapped. as_unmap()
   1109  * blocks until all phsyio to this segment is completed. this
   1110  * doesn't happen until all the pages in this segment are not
   1111  * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
   1112  * requests still outstanding. this special thread will make sure
   1113  * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
   1114  *
   1115  * this function will return an error if the process has only
   1116  * one LWP. the assumption is that the caller is a separate LWP
   1117  * that remains blocked in the kernel for the life of this process.
   1118  */
   1119 static int
   1120 aiostart(void)
   1121 {
   1122 	proc_t *p = curproc;
   1123 	aio_t *aiop;
   1124 	int first, error = 0;
   1125 
   1126 	if (p->p_lwpcnt == 1)
   1127 		return (EDEADLK);
   1128 	mutex_enter(&p->p_lock);
   1129 	if ((aiop = p->p_aio) == NULL)
   1130 		error = EINVAL;
   1131 	else {
   1132 		first = aiop->aio_ok;
   1133 		if (aiop->aio_ok == 0)
   1134 			aiop->aio_ok = 1;
   1135 	}
   1136 	mutex_exit(&p->p_lock);
   1137 	if (error == 0 && first == 0) {
   1138 		return (aio_cleanup_thread(aiop));
   1139 		/* should return only to exit */
   1140 	}
   1141 	return (error);
   1142 }
   1143 
   1144 /*
   1145  * Associate an aiocb with a port.
   1146  * This function is used by aiorw() to associate a transaction with a port.
   1147  * Allocate an event port structure (port_alloc_event()) and store the
   1148  * delivered user pointer (portnfy_user) in the portkev_user field of the
   1149  * port_kevent_t structure..
   1150  * The aio_req_portkev pointer in the aio_req_t structure was added to identify
   1151  * the port association.
   1152  */
   1153 
   1154 static int
   1155 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
   1156 	aio_req_t *reqp, int event)
   1157 {
   1158 	port_kevent_t	*pkevp = NULL;
   1159 	int		error;
   1160 
   1161 	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
   1162 	    PORT_SOURCE_AIO, &pkevp);
   1163 	if (error) {
   1164 		if ((error == ENOMEM) || (error == EAGAIN))
   1165 			error = EAGAIN;
   1166 		else
   1167 			error = EINVAL;
   1168 	} else {
   1169 		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
   1170 		    aio_port_callback, reqp);
   1171 		pkevp->portkev_events = event;
   1172 		reqp->aio_req_portkev = pkevp;
   1173 		reqp->aio_req_port = pntfy->portnfy_port;
   1174 	}
   1175 	return (error);
   1176 }
   1177 
   1178 #ifdef _LP64
   1179 
   1180 /*
   1181  * Asynchronous list IO. A chain of aiocb's are copied in
   1182  * one at a time. If the aiocb is invalid, it is skipped.
   1183  * For each aiocb, the appropriate driver entry point is
   1184  * called. Optimize for the common case where the list
   1185  * of requests is to the same file descriptor.
   1186  *
   1187  * One possible optimization is to define a new driver entry
   1188  * point that supports a list of IO requests. Whether this
   1189  * improves performance depends somewhat on the driver's
   1190  * locking strategy. Processing a list could adversely impact
   1191  * the driver's interrupt latency.
   1192  */
   1193 static int
   1194 alio(
   1195 	int		mode_arg,
   1196 	aiocb_t		**aiocb_arg,
   1197 	int		nent,
   1198 	struct sigevent	*sigev)
   1199 {
   1200 	file_t		*fp;
   1201 	file_t		*prev_fp = NULL;
   1202 	int		prev_mode = -1;
   1203 	struct vnode	*vp;
   1204 	aio_lio_t	*head;
   1205 	aio_req_t	*reqp;
   1206 	aio_t		*aiop;
   1207 	caddr_t		cbplist;
   1208 	aiocb_t		cb;
   1209 	aiocb_t		*aiocb = &cb;
   1210 	aiocb_t		*cbp;
   1211 	aiocb_t		**ucbp;
   1212 	struct sigevent sigevk;
   1213 	sigqueue_t	*sqp;
   1214 	int		(*aio_func)();
   1215 	int		mode;
   1216 	int		error = 0;
   1217 	int		aio_errors = 0;
   1218 	int		i;
   1219 	size_t		ssize;
   1220 	int		deadhead = 0;
   1221 	int		aio_notsupported = 0;
   1222 	int		lio_head_port;
   1223 	int		aio_port;
   1224 	int		aio_thread;
   1225 	port_kevent_t	*pkevtp = NULL;
   1226 	int		portused = 0;
   1227 	port_notify_t	pnotify;
   1228 	int		event;
   1229 
   1230 	aiop = curproc->p_aio;
   1231 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
   1232 		return (EINVAL);
   1233 
   1234 	ssize = (sizeof (aiocb_t *) * nent);
   1235 	cbplist = kmem_alloc(ssize, KM_SLEEP);
   1236 	ucbp = (aiocb_t **)cbplist;
   1237 
   1238 	if (copyin(aiocb_arg, cbplist, ssize) ||
   1239 	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
   1240 		kmem_free(cbplist, ssize);
   1241 		return (EFAULT);
   1242 	}
   1243 
   1244 	/* Event Ports  */
   1245 	if (sigev &&
   1246 	    (sigevk.sigev_notify == SIGEV_THREAD ||
   1247 	    sigevk.sigev_notify == SIGEV_PORT)) {
   1248 		if (sigevk.sigev_notify == SIGEV_THREAD) {
   1249 			pnotify.portnfy_port = sigevk.sigev_signo;
   1250 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
   1251 		} else if (copyin(sigevk.sigev_value.sival_ptr,
   1252 		    &pnotify, sizeof (pnotify))) {
   1253 			kmem_free(cbplist, ssize);
   1254 			return (EFAULT);
   1255 		}
   1256 		error = port_alloc_event(pnotify.portnfy_port,
   1257 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
   1258 		if (error) {
   1259 			if (error == ENOMEM || error == EAGAIN)
   1260 				error = EAGAIN;
   1261 			else
   1262 				error = EINVAL;
   1263 			kmem_free(cbplist, ssize);
   1264 			return (error);
   1265 		}
   1266 		lio_head_port = pnotify.portnfy_port;
   1267 		portused = 1;
   1268 	}
   1269 
   1270 	/*
   1271 	 * a list head should be allocated if notification is
   1272 	 * enabled for this list.
   1273 	 */
   1274 	head = NULL;
   1275 
   1276 	if (mode_arg == LIO_WAIT || sigev) {
   1277 		mutex_enter(&aiop->aio_mutex);
   1278 		error = aio_lio_alloc(&head);
   1279 		mutex_exit(&aiop->aio_mutex);
   1280 		if (error)
   1281 			goto done;
   1282 		deadhead = 1;
   1283 		head->lio_nent = nent;
   1284 		head->lio_refcnt = nent;
   1285 		head->lio_port = -1;
   1286 		head->lio_portkev = NULL;
   1287 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
   1288 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
   1289 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
   1290 			if (sqp == NULL) {
   1291 				error = EAGAIN;
   1292 				goto done;
   1293 			}
   1294 			sqp->sq_func = NULL;
   1295 			sqp->sq_next = NULL;
   1296 			sqp->sq_info.si_code = SI_ASYNCIO;
   1297 			sqp->sq_info.si_pid = curproc->p_pid;
   1298 			sqp->sq_info.si_ctid = PRCTID(curproc);
   1299 			sqp->sq_info.si_zoneid = getzoneid();
   1300 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
   1301 			sqp->sq_info.si_signo = sigevk.sigev_signo;
   1302 			sqp->sq_info.si_value = sigevk.sigev_value;
   1303 			head->lio_sigqp = sqp;
   1304 		} else {
   1305 			head->lio_sigqp = NULL;
   1306 		}
   1307 		if (pkevtp) {
   1308 			/*
   1309 			 * Prepare data to send when list of aiocb's
   1310 			 * has completed.
   1311 			 */
   1312 			port_init_event(pkevtp, (uintptr_t)sigev,
   1313 			    (void *)(uintptr_t)pnotify.portnfy_user,
   1314 			    NULL, head);
   1315 			pkevtp->portkev_events = AIOLIO;
   1316 			head->lio_portkev = pkevtp;
   1317 			head->lio_port = pnotify.portnfy_port;
   1318 		}
   1319 	}
   1320 
   1321 	for (i = 0; i < nent; i++, ucbp++) {
   1322 
   1323 		cbp = *ucbp;
   1324 		/* skip entry if it can't be copied. */
   1325 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
   1326 			if (head) {
   1327 				mutex_enter(&aiop->aio_mutex);
   1328 				head->lio_nent--;
   1329 				head->lio_refcnt--;
   1330 				mutex_exit(&aiop->aio_mutex);
   1331 			}
   1332 			continue;
   1333 		}
   1334 
   1335 		/* skip if opcode for aiocb is LIO_NOP */
   1336 		mode = aiocb->aio_lio_opcode;
   1337 		if (mode == LIO_NOP) {
   1338 			cbp = NULL;
   1339 			if (head) {
   1340 				mutex_enter(&aiop->aio_mutex);
   1341 				head->lio_nent--;
   1342 				head->lio_refcnt--;
   1343 				mutex_exit(&aiop->aio_mutex);
   1344 			}
   1345 			continue;
   1346 		}
   1347 
   1348 		/* increment file descriptor's ref count. */
   1349 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
   1350 			lio_set_uerror(&cbp->aio_resultp, EBADF);
   1351 			if (head) {
   1352 				mutex_enter(&aiop->aio_mutex);
   1353 				head->lio_nent--;
   1354 				head->lio_refcnt--;
   1355 				mutex_exit(&aiop->aio_mutex);
   1356 			}
   1357 			aio_errors++;
   1358 			continue;
   1359 		}
   1360 
   1361 		/*
   1362 		 * check the permission of the partition
   1363 		 */
   1364 		if ((fp->f_flag & mode) == 0) {
   1365 			releasef(aiocb->aio_fildes);
   1366 			lio_set_uerror(&cbp->aio_resultp, EBADF);
   1367 			if (head) {
   1368 				mutex_enter(&aiop->aio_mutex);
   1369 				head->lio_nent--;
   1370 				head->lio_refcnt--;
   1371 				mutex_exit(&aiop->aio_mutex);
   1372 			}
   1373 			aio_errors++;
   1374 			continue;
   1375 		}
   1376 
   1377 		/*
   1378 		 * common case where requests are to the same fd
   1379 		 * for the same r/w operation.
   1380 		 * for UFS, need to set EBADFD
   1381 		 */
   1382 		vp = fp->f_vnode;
   1383 		if (fp != prev_fp || mode != prev_mode) {
   1384 			aio_func = check_vp(vp, mode);
   1385 			if (aio_func == NULL) {
   1386 				prev_fp = NULL;
   1387 				releasef(aiocb->aio_fildes);
   1388 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
   1389 				aio_notsupported++;
   1390 				if (head) {
   1391 					mutex_enter(&aiop->aio_mutex);
   1392 					head->lio_nent--;
   1393 					head->lio_refcnt--;
   1394 					mutex_exit(&aiop->aio_mutex);
   1395 				}
   1396 				continue;
   1397 			} else {
   1398 				prev_fp = fp;
   1399 				prev_mode = mode;
   1400 			}
   1401 		}
   1402 
   1403 		error = aio_req_setup(&reqp, aiop, aiocb,
   1404 		    &cbp->aio_resultp, vp, 0);
   1405 		if (error) {
   1406 			releasef(aiocb->aio_fildes);
   1407 			lio_set_uerror(&cbp->aio_resultp, error);
   1408 			if (head) {
   1409 				mutex_enter(&aiop->aio_mutex);
   1410 				head->lio_nent--;
   1411 				head->lio_refcnt--;
   1412 				mutex_exit(&aiop->aio_mutex);
   1413 			}
   1414 			aio_errors++;
   1415 			continue;
   1416 		}
   1417 
   1418 		reqp->aio_req_lio = head;
   1419 		deadhead = 0;
   1420 
   1421 		/*
   1422 		 * Set the errno field now before sending the request to
   1423 		 * the driver to avoid a race condition
   1424 		 */
   1425 		(void) suword32(&cbp->aio_resultp.aio_errno,
   1426 		    EINPROGRESS);
   1427 
   1428 		reqp->aio_req_iocb.iocb = (caddr_t)cbp;
   1429 
   1430 		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
   1431 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
   1432 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
   1433 		if (aio_port | aio_thread) {
   1434 			port_kevent_t *lpkevp;
   1435 			/*
   1436 			 * Prepare data to send with each aiocb completed.
   1437 			 */
   1438 			if (aio_port) {
   1439 				void *paddr =
   1440 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
   1441 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
   1442 					error = EFAULT;
   1443 			} else {	/* aio_thread */
   1444 				pnotify.portnfy_port =
   1445 				    aiocb->aio_sigevent.sigev_signo;
   1446 				pnotify.portnfy_user =
   1447 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
   1448 			}
   1449 			if (error)
   1450 				/* EMPTY */;
   1451 			else if (pkevtp != NULL &&
   1452 			    pnotify.portnfy_port == lio_head_port)
   1453 				error = port_dup_event(pkevtp, &lpkevp,
   1454 				    PORT_ALLOC_DEFAULT);
   1455 			else
   1456 				error = port_alloc_event(pnotify.portnfy_port,
   1457 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
   1458 				    &lpkevp);
   1459 			if (error == 0) {
   1460 				port_init_event(lpkevp, (uintptr_t)cbp,
   1461 				    (void *)(uintptr_t)pnotify.portnfy_user,
   1462 				    aio_port_callback, reqp);
   1463 				lpkevp->portkev_events = event;
   1464 				reqp->aio_req_portkev = lpkevp;
   1465 				reqp->aio_req_port = pnotify.portnfy_port;
   1466 			}
   1467 		}
   1468 
   1469 		/*
   1470 		 * send the request to driver.
   1471 		 */
   1472 		if (error == 0) {
   1473 			if (aiocb->aio_nbytes == 0) {
   1474 				clear_active_fd(aiocb->aio_fildes);
   1475 				aio_zerolen(reqp);
   1476 				continue;
   1477 			}
   1478 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
   1479 			    CRED());
   1480 		}
   1481 
   1482 		/*
   1483 		 * the fd's ref count is not decremented until the IO has
   1484 		 * completed unless there was an error.
   1485 		 */
   1486 		if (error) {
   1487 			releasef(aiocb->aio_fildes);
   1488 			lio_set_uerror(&cbp->aio_resultp, error);
   1489 			if (head) {
   1490 				mutex_enter(&aiop->aio_mutex);
   1491 				head->lio_nent--;
   1492 				head->lio_refcnt--;
   1493 				mutex_exit(&aiop->aio_mutex);
   1494 			}
   1495 			if (error == ENOTSUP)
   1496 				aio_notsupported++;
   1497 			else
   1498 				aio_errors++;
   1499 			lio_set_error(reqp, portused);
   1500 		} else {
   1501 			clear_active_fd(aiocb->aio_fildes);
   1502 		}
   1503 	}
   1504 
   1505 	if (aio_notsupported) {
   1506 		error = ENOTSUP;
   1507 	} else if (aio_errors) {
   1508 		/*
   1509 		 * return EIO if any request failed
   1510 		 */
   1511 		error = EIO;
   1512 	}
   1513 
   1514 	if (mode_arg == LIO_WAIT) {
   1515 		mutex_enter(&aiop->aio_mutex);
   1516 		while (head->lio_refcnt > 0) {
   1517 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
   1518 				mutex_exit(&aiop->aio_mutex);
   1519 				error = EINTR;
   1520 				goto done;
   1521 			}
   1522 		}
   1523 		mutex_exit(&aiop->aio_mutex);
   1524 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
   1525 	}
   1526 
   1527 done:
   1528 	kmem_free(cbplist, ssize);
   1529 	if (deadhead) {
   1530 		if (head->lio_sigqp)
   1531 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
   1532 		if (head->lio_portkev)
   1533 			port_free_event(head->lio_portkev);
   1534 		kmem_free(head, sizeof (aio_lio_t));
   1535 	}
   1536 	return (error);
   1537 }
   1538 
   1539 #endif /* _LP64 */
   1540 
   1541 /*
   1542  * Asynchronous list IO.
   1543  * If list I/O is called with LIO_WAIT it can still return
   1544  * before all the I/O's are completed if a signal is caught
   1545  * or if the list include UFS I/O requests. If this happens,
   1546  * libaio will call aliowait() to wait for the I/O's to
   1547  * complete
   1548  */
   1549 /*ARGSUSED*/
   1550 static int
   1551 aliowait(
   1552 	int	mode,
   1553 	void	*aiocb,
   1554 	int	nent,
   1555 	void	*sigev,
   1556 	int	run_mode)
   1557 {
   1558 	aio_lio_t	*head;
   1559 	aio_t		*aiop;
   1560 	caddr_t		cbplist;
   1561 	aiocb_t		*cbp, **ucbp;
   1562 #ifdef	_SYSCALL32_IMPL
   1563 	aiocb32_t	*cbp32;
   1564 	caddr32_t	*ucbp32;
   1565 	aiocb64_32_t	*cbp64;
   1566 #endif
   1567 	int		error = 0;
   1568 	int		i;
   1569 	size_t		ssize = 0;
   1570 	model_t		model = get_udatamodel();
   1571 
   1572 	aiop = curproc->p_aio;
   1573 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
   1574 		return (EINVAL);
   1575 
   1576 	if (model == DATAMODEL_NATIVE)
   1577 		ssize = (sizeof (aiocb_t *) * nent);
   1578 #ifdef	_SYSCALL32_IMPL
   1579 	else
   1580 		ssize = (sizeof (caddr32_t) * nent);
   1581 #endif  /* _SYSCALL32_IMPL */
   1582 
   1583 	if (ssize == 0)
   1584 		return (EINVAL);
   1585 
   1586 	cbplist = kmem_alloc(ssize, KM_SLEEP);
   1587 
   1588 	if (model == DATAMODEL_NATIVE)
   1589 		ucbp = (aiocb_t **)cbplist;
   1590 #ifdef	_SYSCALL32_IMPL
   1591 	else
   1592 		ucbp32 = (caddr32_t *)cbplist;
   1593 #endif  /* _SYSCALL32_IMPL */
   1594 
   1595 	if (copyin(aiocb, cbplist, ssize)) {
   1596 		error = EFAULT;
   1597 		goto done;
   1598 	}
   1599 
   1600 	/*
   1601 	 * To find the list head, we go through the
   1602 	 * list of aiocb structs, find the request
   1603 	 * its for, then get the list head that reqp
   1604 	 * points to
   1605 	 */
   1606 	head = NULL;
   1607 
   1608 	for (i = 0; i < nent; i++) {
   1609 		if (model == DATAMODEL_NATIVE) {
   1610 			/*
   1611 			 * Since we are only checking for a NULL pointer
   1612 			 * Following should work on both native data sizes
   1613 			 * as well as for largefile aiocb.
   1614 			 */
   1615 			if ((cbp = *ucbp++) == NULL)
   1616 				continue;
   1617 			if (run_mode != AIO_LARGEFILE)
   1618 				if (head = aio_list_get(&cbp->aio_resultp))
   1619 					break;
   1620 			else {
   1621 				/*
   1622 				 * This is a case when largefile call is
   1623 				 * made on 32 bit kernel.
   1624 				 * Treat each pointer as pointer to
   1625 				 * aiocb64_32
   1626 				 */
   1627 				if (head = aio_list_get((aio_result_t *)
   1628 				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
   1629 					break;
   1630 			}
   1631 		}
   1632 #ifdef	_SYSCALL32_IMPL
   1633 		else {
   1634 			if (run_mode == AIO_LARGEFILE) {
   1635 				if ((cbp64 = (aiocb64_32_t *)
   1636 				    (uintptr_t)*ucbp32++) == NULL)
   1637 					continue;
   1638 				if (head = aio_list_get((aio_result_t *)
   1639 				    &cbp64->aio_resultp))
   1640 					break;
   1641 			} else if (run_mode == AIO_32) {
   1642 				if ((cbp32 = (aiocb32_t *)
   1643 				    (uintptr_t)*ucbp32++) == NULL)
   1644 					continue;
   1645 				if (head = aio_list_get((aio_result_t *)
   1646 				    &cbp32->aio_resultp))
   1647 					break;
   1648 			}
   1649 		}
   1650 #endif	/* _SYSCALL32_IMPL */
   1651 	}
   1652 
   1653 	if (head == NULL) {
   1654 		error = EINVAL;
   1655 		goto done;
   1656 	}
   1657 
   1658 	mutex_enter(&aiop->aio_mutex);
   1659 	while (head->lio_refcnt > 0) {
   1660 		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
   1661 			mutex_exit(&aiop->aio_mutex);
   1662 			error = EINTR;
   1663 			goto done;
   1664 		}
   1665 	}
   1666 	mutex_exit(&aiop->aio_mutex);
   1667 	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
   1668 done:
   1669 	kmem_free(cbplist, ssize);
   1670 	return (error);
   1671 }
   1672 
   1673 aio_lio_t *
   1674 aio_list_get(aio_result_t *resultp)
   1675 {
   1676 	aio_lio_t	*head = NULL;
   1677 	aio_t		*aiop;
   1678 	aio_req_t 	**bucket;
   1679 	aio_req_t 	*reqp;
   1680 	long		index;
   1681 
   1682 	aiop = curproc->p_aio;
   1683 	if (aiop == NULL)
   1684 		return (NULL);
   1685 
   1686 	if (resultp) {
   1687 		index = AIO_HASH(resultp);
   1688 		bucket = &aiop->aio_hash[index];
   1689 		for (reqp = *bucket; reqp != NULL;
   1690 		    reqp = reqp->aio_hash_next) {
   1691 			if (reqp->aio_req_resultp == resultp) {
   1692 				head = reqp->aio_req_lio;
   1693 				return (head);
   1694 			}
   1695 		}
   1696 	}
   1697 	return (NULL);
   1698 }
   1699 
   1700 
   1701 static void
   1702 lio_set_uerror(void *resultp, int error)
   1703 {
   1704 	/*
   1705 	 * the resultp field is a pointer to where the
   1706 	 * error should be written out to the user's
   1707 	 * aiocb.
   1708 	 *
   1709 	 */
   1710 	if (get_udatamodel() == DATAMODEL_NATIVE) {
   1711 		(void) sulword(&((aio_result_t *)resultp)->aio_return,
   1712 		    (ssize_t)-1);
   1713 		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
   1714 	}
   1715 #ifdef	_SYSCALL32_IMPL
   1716 	else {
   1717 		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
   1718 		    (uint_t)-1);
   1719 		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
   1720 	}
   1721 #endif  /* _SYSCALL32_IMPL */
   1722 }
   1723 
   1724 /*
   1725  * do cleanup completion for all requests in list. memory for
   1726  * each request is also freed.
   1727  */
   1728 static void
   1729 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
   1730 {
   1731 	int i;
   1732 	aio_req_t *reqp;
   1733 	aio_result_t *resultp;
   1734 	aiocb64_32_t *aiocb_64;
   1735 
   1736 	for (i = 0; i < nent; i++) {
   1737 		if (get_udatamodel() == DATAMODEL_NATIVE) {
   1738 			if (cbp[i] == NULL)
   1739 				continue;
   1740 			if (run_mode == AIO_LARGEFILE) {
   1741 				aiocb_64 = (aiocb64_32_t *)cbp[i];
   1742 				resultp = (aio_result_t *)
   1743 				    &aiocb_64->aio_resultp;
   1744 			} else
   1745 				resultp = &cbp[i]->aio_resultp;
   1746 		}
   1747 #ifdef	_SYSCALL32_IMPL
   1748 		else {
   1749 			aiocb32_t *aiocb_32;
   1750 			caddr32_t *cbp32;
   1751 
   1752 			cbp32 = (caddr32_t *)cbp;
   1753 			if (cbp32[i] == NULL)
   1754 				continue;
   1755 			if (run_mode == AIO_32) {
   1756 				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
   1757 				resultp = (aio_result_t *)&aiocb_32->
   1758 				    aio_resultp;
   1759 			} else if (run_mode == AIO_LARGEFILE) {
   1760 				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
   1761 				resultp = (aio_result_t *)&aiocb_64->
   1762 				    aio_resultp;
   1763 			}
   1764 		}
   1765 #endif  /* _SYSCALL32_IMPL */
   1766 		/*
   1767 		 * we need to get the aio_cleanupq_mutex since we call
   1768 		 * aio_req_done().
   1769 		 */
   1770 		mutex_enter(&aiop->aio_cleanupq_mutex);
   1771 		mutex_enter(&aiop->aio_mutex);
   1772 		reqp = aio_req_done(resultp);
   1773 		mutex_exit(&aiop->aio_mutex);
   1774 		mutex_exit(&aiop->aio_cleanupq_mutex);
   1775 		if (reqp != NULL) {
   1776 			aphysio_unlock(reqp);
   1777 			aio_copyout_result(reqp);
   1778 			mutex_enter(&aiop->aio_mutex);
   1779 			aio_req_free(aiop, reqp);
   1780 			mutex_exit(&aiop->aio_mutex);
   1781 		}
   1782 	}
   1783 }
   1784 
   1785 /*
   1786  * Write out the results for an aio request that is done.
   1787  */
   1788 static int
   1789 aioerror(void *cb, int run_mode)
   1790 {
   1791 	aio_result_t *resultp;
   1792 	aio_t *aiop;
   1793 	aio_req_t *reqp;
   1794 	int retval;
   1795 
   1796 	aiop = curproc->p_aio;
   1797 	if (aiop == NULL || cb == NULL)
   1798 		return (EINVAL);
   1799 
   1800 	if (get_udatamodel() == DATAMODEL_NATIVE) {
   1801 		if (run_mode == AIO_LARGEFILE)
   1802 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
   1803 			    aio_resultp;
   1804 		else
   1805 			resultp = &((aiocb_t *)cb)->aio_resultp;
   1806 	}
   1807 #ifdef	_SYSCALL32_IMPL
   1808 	else {
   1809 		if (run_mode == AIO_LARGEFILE)
   1810 			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
   1811 			    aio_resultp;
   1812 		else if (run_mode == AIO_32)
   1813 			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
   1814 			    aio_resultp;
   1815 	}
   1816 #endif  /* _SYSCALL32_IMPL */
   1817 	/*
   1818 	 * we need to get the aio_cleanupq_mutex since we call
   1819 	 * aio_req_find().
   1820 	 */
   1821 	mutex_enter(&aiop->aio_cleanupq_mutex);
   1822 	mutex_enter(&aiop->aio_mutex);
   1823 	retval = aio_req_find(resultp, &reqp);
   1824 	mutex_exit(&aiop->aio_mutex);
   1825 	mutex_exit(&aiop->aio_cleanupq_mutex);
   1826 	if (retval == 0) {
   1827 		aphysio_unlock(reqp);
   1828 		aio_copyout_result(reqp);
   1829 		mutex_enter(&aiop->aio_mutex);
   1830 		aio_req_free(aiop, reqp);
   1831 		mutex_exit(&aiop->aio_mutex);
   1832 		return (0);
   1833 	} else if (retval == 1)
   1834 		return (EINPROGRESS);
   1835 	else if (retval == 2)
   1836 		return (EINVAL);
   1837 	return (0);
   1838 }
   1839 
   1840 /*
   1841  * 	aio_cancel - if no requests outstanding,
   1842  *			return AIO_ALLDONE
   1843  *			else
   1844  *			return AIO_NOTCANCELED
   1845  */
   1846 static int
   1847 aio_cancel(
   1848 	int	fildes,
   1849 	void 	*cb,
   1850 	long	*rval,
   1851 	int	run_mode)
   1852 {
   1853 	aio_t *aiop;
   1854 	void *resultp;
   1855 	int index;
   1856 	aio_req_t **bucket;
   1857 	aio_req_t *ent;
   1858 
   1859 
   1860 	/*
   1861 	 * Verify valid file descriptor
   1862 	 */
   1863 	if ((getf(fildes)) == NULL) {
   1864 		return (EBADF);
   1865 	}
   1866 	releasef(fildes);
   1867 
   1868 	aiop = curproc->p_aio;
   1869 	if (aiop == NULL)
   1870 		return (EINVAL);
   1871 
   1872 	if (aiop->aio_outstanding == 0) {
   1873 		*rval = AIO_ALLDONE;
   1874 		return (0);
   1875 	}
   1876 
   1877 	mutex_enter(&aiop->aio_mutex);
   1878 	if (cb != NULL) {
   1879 		if (get_udatamodel() == DATAMODEL_NATIVE) {
   1880 			if (run_mode == AIO_LARGEFILE)
   1881 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
   1882 				    ->aio_resultp;
   1883 			else
   1884 				resultp = &((aiocb_t *)cb)->aio_resultp;
   1885 		}
   1886 #ifdef	_SYSCALL32_IMPL
   1887 		else {
   1888 			if (run_mode == AIO_LARGEFILE)
   1889 				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
   1890 				    ->aio_resultp;
   1891 			else if (run_mode == AIO_32)
   1892 				resultp = (aio_result_t *)&((aiocb32_t *)cb)
   1893 				    ->aio_resultp;
   1894 		}
   1895 #endif  /* _SYSCALL32_IMPL */
   1896 		index = AIO_HASH(resultp);
   1897 		bucket = &aiop->aio_hash[index];
   1898 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
   1899 			if (ent->aio_req_resultp == resultp) {
   1900 				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
   1901 					mutex_exit(&aiop->aio_mutex);
   1902 					*rval = AIO_ALLDONE;
   1903 					return (0);
   1904 				}
   1905 				mutex_exit(&aiop->aio_mutex);
   1906 				*rval = AIO_NOTCANCELED;
   1907 				return (0);
   1908 			}
   1909 		}
   1910 		mutex_exit(&aiop->aio_mutex);
   1911 		*rval = AIO_ALLDONE;
   1912 		return (0);
   1913 	}
   1914 
   1915 	for (index = 0; index < AIO_HASHSZ; index++) {
   1916 		bucket = &aiop->aio_hash[index];
   1917 		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
   1918 			if (ent->aio_req_fd == fildes) {
   1919 				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
   1920 					mutex_exit(&aiop->aio_mutex);
   1921 					*rval = AIO_NOTCANCELED;
   1922 					return (0);
   1923 				}
   1924 			}
   1925 		}
   1926 	}
   1927 	mutex_exit(&aiop->aio_mutex);
   1928 	*rval = AIO_ALLDONE;
   1929 	return (0);
   1930 }
   1931 
   1932 /*
   1933  * solaris version of asynchronous read and write
   1934  */
   1935 static int
   1936 arw(
   1937 	int	opcode,
   1938 	int	fdes,
   1939 	char	*bufp,
   1940 	int	bufsize,
   1941 	offset_t	offset,
   1942 	aio_result_t	*resultp,
   1943 	int		mode)
   1944 {
   1945 	file_t		*fp;
   1946 	int		error;
   1947 	struct vnode	*vp;
   1948 	aio_req_t	*reqp;
   1949 	aio_t		*aiop;
   1950 	int		(*aio_func)();
   1951 #ifdef _LP64
   1952 	aiocb_t		aiocb;
   1953 #else
   1954 	aiocb64_32_t	aiocb64;
   1955 #endif
   1956 
   1957 	aiop = curproc->p_aio;
   1958 	if (aiop == NULL)
   1959 		return (EINVAL);
   1960 
   1961 	if ((fp = getf(fdes)) == NULL) {
   1962 		return (EBADF);
   1963 	}
   1964 
   1965 	/*
   1966 	 * check the permission of the partition
   1967 	 */
   1968 	if ((fp->f_flag & mode) == 0) {
   1969 		releasef(fdes);
   1970 		return (EBADF);
   1971 	}
   1972 
   1973 	vp = fp->f_vnode;
   1974 	aio_func = check_vp(vp, mode);
   1975 	if (aio_func == NULL) {
   1976 		releasef(fdes);
   1977 		return (EBADFD);
   1978 	}
   1979 #ifdef _LP64
   1980 	aiocb.aio_fildes = fdes;
   1981 	aiocb.aio_buf = bufp;
   1982 	aiocb.aio_nbytes = bufsize;
   1983 	aiocb.aio_offset = offset;
   1984 	aiocb.aio_sigevent.sigev_notify = 0;
   1985 	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
   1986 #else
   1987 	aiocb64.aio_fildes = fdes;
   1988 	aiocb64.aio_buf = (caddr32_t)bufp;
   1989 	aiocb64.aio_nbytes = bufsize;
   1990 	aiocb64.aio_offset = offset;
   1991 	aiocb64.aio_sigevent.sigev_notify = 0;
   1992 	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
   1993 #endif
   1994 	if (error) {
   1995 		releasef(fdes);
   1996 		return (error);
   1997 	}
   1998 
   1999 	/*
   2000 	 * enable polling on this request if the opcode has
   2001 	 * the AIO poll bit set
   2002 	 */
   2003 	if (opcode & AIO_POLL_BIT)
   2004 		reqp->aio_req_flags |= AIO_POLL;
   2005 
   2006 	if (bufsize == 0) {
   2007 		clear_active_fd(fdes);
   2008 		aio_zerolen(reqp);
   2009 		return (0);
   2010 	}
   2011 	/*
   2012 	 * send the request to driver.
   2013 	 */
   2014 	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
   2015 	/*
   2016 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
   2017 	 * is released by the aio_cleanup_thread() when the IO has
   2018 	 * completed.
   2019 	 */
   2020 	if (error) {
   2021 		releasef(fdes);
   2022 		mutex_enter(&aiop->aio_mutex);
   2023 		aio_req_free(aiop, reqp);
   2024 		aiop->aio_pending--;
   2025 		if (aiop->aio_flags & AIO_REQ_BLOCK)
   2026 			cv_signal(&aiop->aio_cleanupcv);
   2027 		mutex_exit(&aiop->aio_mutex);
   2028 		return (error);
   2029 	}
   2030 	clear_active_fd(fdes);
   2031 	return (0);
   2032 }
   2033 
   2034 /*
   2035  * posix version of asynchronous read and write
         *
         * Copies in the caller's aiocb (native, 32-bit or largefile layout,
         * chosen from the data model and run_mode), holds the file descriptor
         * named in it, builds an aio request and passes that request to the
         * function returned by check_vp().
         *
         *	opcode		only AIO_POLL_BIT is examined here
         *	aiocb_arg	user address of the aiocb
         *	mode		tested against fp->f_flag and handed to check_vp();
         *			FREAD selects the read port events, any other
         *			value the write ones
         *	run_mode	AIO_LARGEFILE selects the aiocb64_32_t layout,
         *			AIO_32 the aiocb32_t layout
         *
         * Returns 0 on success.  Otherwise returns EINVAL (no aio state),
         * EFAULT (copyin failed), EBADF (bad descriptor or access mode),
         * EBADFD (check_vp() returned no function), or the error returned by
         * request setup, port association or the I/O function itself.
   2036  */
   2037 static int
   2038 aiorw(
   2039 	int		opcode,
   2040 	void		*aiocb_arg,
   2041 	int		mode,
   2042 	int		run_mode)
   2043 {
   2044 #ifdef _SYSCALL32_IMPL
   2045 	aiocb32_t	aiocb32;
   2046 	struct	sigevent32 *sigev32;
   2047 	port_notify32_t	pntfy32;
   2048 #endif
   2049 	aiocb64_32_t	aiocb64;
   2050 	aiocb_t		aiocb;
   2051 	file_t		*fp;
   2052 	int		error, fd;
   2053 	size_t		bufsize;
   2054 	struct vnode	*vp;
   2055 	aio_req_t	*reqp;
   2056 	aio_t		*aiop;
   2057 	int		(*aio_func)();
   2058 	aio_result_t	*resultp;
   2059 	struct	sigevent *sigev;
   2060 	model_t		model;
   2061 	int		aio_use_port = 0;
   2062 	port_notify_t	pntfy;
   2063 
   2064 	model = get_udatamodel();
   2065 	aiop = curproc->p_aio;
   2066 	if (aiop == NULL)
   2067 		return (EINVAL);
   2068 
   2069 	if (model == DATAMODEL_NATIVE) {
   2070 		if (run_mode != AIO_LARGEFILE) {
   2071 			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
   2072 				return (EFAULT);
   2073 			bufsize = aiocb.aio_nbytes;
   2074 			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
   2075 			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
   2076 				return (EBADF);
   2077 			}
   2078 			sigev = &aiocb.aio_sigevent;
   2079 		} else {
   2080 			/*
   2081 			 * We come here only when we make largefile
   2082 			 * call on 32 bit kernel using 32 bit library.
   2083 			 */
   2084 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
   2085 				return (EFAULT);
   2086 			bufsize = aiocb64.aio_nbytes;
   2087 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
   2088 			    ->aio_resultp);
   2089 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
   2090 				return (EBADF);
   2091 			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
   2092 		}
   2093 
        		/* from here on every failure path must releasef(fd) */
   2094 		if (sigev->sigev_notify == SIGEV_PORT) {
   2095 			if (copyin((void *)sigev->sigev_value.sival_ptr,
   2096 			    &pntfy, sizeof (port_notify_t))) {
   2097 				releasef(fd);
   2098 				return (EFAULT);
   2099 			}
   2100 			aio_use_port = 1;
   2101 		} else if (sigev->sigev_notify == SIGEV_THREAD) {
        			/*
        			 * Read the notification through sigev rather than
        			 * through aiocb: on the largefile path above only
        			 * aiocb64 was copied in, so aiocb is uninitialized.
        			 */
   2102 			pntfy.portnfy_port = sigev->sigev_signo;
   2103 			pntfy.portnfy_user =
   2104 			    sigev->sigev_value.sival_ptr;
   2105 			aio_use_port = 1;
   2106 		}
   2107 	}
   2108 #ifdef	_SYSCALL32_IMPL
   2109 	else {
   2110 		if (run_mode == AIO_32) {
   2111 			/* 32 bit system call is being made on 64 bit kernel */
   2112 			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
   2113 				return (EFAULT);
   2114 
   2115 			bufsize = aiocb32.aio_nbytes;
   2116 			aiocb_32ton(&aiocb32, &aiocb);
   2117 			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
   2118 			    aio_resultp);
   2119 			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
   2120 				return (EBADF);
   2121 			}
   2122 			sigev32 = &aiocb32.aio_sigevent;
   2123 		} else if (run_mode == AIO_LARGEFILE) {
   2124 			/*
   2125 			 * We come here only when we make largefile
   2126 			 * call on 64 bit kernel using 32 bit library.
   2127 			 */
   2128 			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
   2129 				return (EFAULT);
   2130 			bufsize = aiocb64.aio_nbytes;
   2131 			aiocb_LFton(&aiocb64, &aiocb);
   2132 			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
   2133 			    ->aio_resultp);
   2134 			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
   2135 				return (EBADF);
   2136 			sigev32 = &aiocb64.aio_sigevent;
   2137 		}
   2138 
        		/*
        		 * NOTE(review): a run_mode other than AIO_32 or
        		 * AIO_LARGEFILE would leave sigev32, fp and fd unset
        		 * here -- presumably callers never pass one; confirm.
        		 */
   2139 		if (sigev32->sigev_notify == SIGEV_PORT) {
   2140 			if (copyin(
   2141 			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
   2142 			    &pntfy32, sizeof (port_notify32_t))) {
   2143 				releasef(fd);
   2144 				return (EFAULT);
   2145 			}
   2146 			pntfy.portnfy_port = pntfy32.portnfy_port;
   2147 			pntfy.portnfy_user = (void *)(uintptr_t)
   2148 			    pntfy32.portnfy_user;
   2149 			aio_use_port = 1;
   2150 		} else if (sigev32->sigev_notify == SIGEV_THREAD) {
   2151 			pntfy.portnfy_port = sigev32->sigev_signo;
   2152 			pntfy.portnfy_user = (void *)(uintptr_t)
   2153 			    sigev32->sigev_value.sival_ptr;
   2154 			aio_use_port = 1;
   2155 		}
   2156 	}
   2157 #endif  /* _SYSCALL32_IMPL */
   2158 
   2159 	/*
   2160 	 * check the permission of the partition
   2161 	 */
   2162 
   2163 	if ((fp->f_flag & mode) == 0) {
   2164 		releasef(fd);
   2165 		return (EBADF);
   2166 	}
   2167 
   2168 	vp = fp->f_vnode;
   2169 	aio_func = check_vp(vp, mode);
   2170 	if (aio_func == NULL) {
   2171 		releasef(fd);
   2172 		return (EBADFD);
   2173 	}
        	/* the largefile aiocb is parsed by its own setup routine */
   2174 	if (run_mode == AIO_LARGEFILE)
   2175 		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
   2176 	else
   2177 		error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);
   2178 
   2179 	if (error) {
   2180 		releasef(fd);
   2181 		return (error);
   2182 	}
   2183 	/*
   2184 	 * enable polling on this request if the opcode has
   2185 	 * the AIO poll bit set
   2186 	 */
   2187 	if (opcode & AIO_POLL_BIT)
   2188 		reqp->aio_req_flags |= AIO_POLL;
   2189 
        	/* remember the user address of the aiocb in the request */
   2190 	if (model == DATAMODEL_NATIVE)
   2191 		reqp->aio_req_iocb.iocb = aiocb_arg;
   2192 #ifdef  _SYSCALL32_IMPL
   2193 	else
   2194 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
   2195 #endif
   2196 
   2197 	if (aio_use_port) {
   2198 		int event = (run_mode == AIO_LARGEFILE)?
   2199 		    ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
   2200 		    ((mode == FREAD)? AIOAREAD : AIOAWRITE);
   2201 		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
   2202 	}
   2203 
   2204 	/*
   2205 	 * send the request to driver.
   2206 	 */
   2207 	if (error == 0) {
        		/* zero-length requests go to aio_zerolen(), not the driver */
   2208 		if (bufsize == 0) {
   2209 			clear_active_fd(fd);
   2210 			aio_zerolen(reqp);
   2211 			return (0);
   2212 		}
   2213 		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
   2214 	}
   2215 
   2216 	/*
   2217 	 * the fd is stored in the aio_req_t by aio_req_setup(), and
   2218 	 * is released by the aio_cleanup_thread() when the IO has
   2219 	 * completed.
   2220 	 */
   2221 	if (error) {
        		/*
        		 * Port association or the I/O function failed: drop the
        		 * fd hold and undo what request setup accounted for.
        		 */
   2222 		releasef(fd);
   2223 		mutex_enter(&aiop->aio_mutex);
   2224 		if (aio_use_port)
   2225 			aio_deq(&aiop->aio_portpending, reqp);
   2226 		aio_req_free(aiop, reqp);
   2227 		aiop->aio_pending--;
   2228 		if (aiop->aio_flags & AIO_REQ_BLOCK)
   2229 			cv_signal(&aiop->aio_cleanupcv);
   2230 		mutex_exit(&aiop->aio_mutex);
   2231 		return (error);
   2232 	}
   2233 	clear_active_fd(fd);
   2234 	return (0);
   2235 }
   2236 
   2237 
   2238 /*
   2239  * set error for a list IO entry that failed.
         *
         * Undoes the accounting for a request that will not be started:
         * optionally removes it from aio_portpending, decrements aio_pending
         * and frees the request, all under aio_mutex.  Does nothing when the
         * current process has no aio state.
         *
         *	reqp		the request to discard
         *	portused	non-zero if reqp has to be dequeued from
         *			aio_portpending first
   2240  */
   2241 static void
   2242 lio_set_error(aio_req_t *reqp, int portused)
   2243 {
   2244 	aio_t *aiop = curproc->p_aio;
   2245 
   2246 	if (aiop == NULL)
   2247 		return;
   2248 
   2249 	mutex_enter(&aiop->aio_mutex);
   2250 	if (portused)
   2251 		aio_deq(&aiop->aio_portpending, reqp);
   2252 	aiop->aio_pending--;
   2253 	/* request failed, AIO_PHYSIODONE set to avoid physio cleanup. */
   2254 	reqp->aio_req_flags |= AIO_PHYSIODONE;
   2255 	/*
   2256 	 * Need to free the request now as its never
   2257 	 * going to get on the done queue
   2258 	 *
   2259 	 * Note: aio_outstanding is decremented in
   2260 	 *	 aio_req_free()
   2261 	 */
   2262 	aio_req_free(aiop, reqp);
        	/* wake a thread waiting on aio_cleanupcv for aio_pending to drain */
   2263 	if (aiop->aio_flags & AIO_REQ_BLOCK)
   2264 		cv_signal(&aiop->aio_cleanupcv);
   2265 	mutex_exit(&aiop->aio_mutex);
   2266 }
   2267 
   2268 /*
   2269  * Dequeue a completed request.  A non-NULL resultp names the wanted
   2270  * request, which is returned only if it is flagged AIO_DONEQ; a NULL
   2271  * resultp takes whatever aio_req_remove(NULL) hands back.
   2272  */
   2273 static aio_req_t *
   2274 aio_req_done(void *resultp)
   2275 {
   2276 	aio_t *aiop = curproc->p_aio;
   2277 	aio_result_t *want = (aio_result_t *)resultp;
   2278 	aio_req_t *cur;
   2279 
   2280 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
   2281 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
   2282 
   2283 	/* no particular request asked for: take any from the done queue */
   2284 	if (want == NULL)
   2285 		return (aio_req_remove(NULL));
   2286 
   2287 	/* walk the hash chain that resultp maps to */
   2288 	cur = aiop->aio_hash[AIO_HASH(resultp)];
   2289 	while (cur != NULL && cur->aio_req_resultp != want)
   2290 		cur = cur->aio_hash_next;
   2291 
   2292 	/*
   2293 	 * Either resultp is unknown (cur == NULL) or the matching
   2294 	 * request is not flagged AIO_DONEQ yet.
   2295 	 */
   2296 	if (cur == NULL || (cur->aio_req_flags & AIO_DONEQ) == 0)
   2297 		return (NULL);
   2298 	return (aio_req_remove(cur));
   2299 }
   2300 
   2301 /*
   2302  * Classify the request registered under the user-level resultp.
   2303  * Returns 0 if the request is flagged AIO_DONEQ; it is then taken
   2304  * off its queue and handed back through "reqp", which is written in
   2305  * this case only.  Returns 1 if the request is known but not yet
   2306  * flagged AIO_DONEQ, and 2 if nothing is hashed under resultp.
   2307  * The caller holds both aio_cleanupq_mutex and aio_mutex.
   2308  */
   2309 static int
   2310 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
   2311 {
   2312 	aio_t *aiop = curproc->p_aio;
   2313 	aio_req_t *cur;
   2314 
   2315 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
   2316 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
   2317 
   2318 	/* scan the hash chain selected by resultp for an exact match */
   2319 	cur = aiop->aio_hash[AIO_HASH(resultp)];
   2320 	while (cur != NULL && cur->aio_req_resultp != resultp)
   2321 		cur = cur->aio_hash_next;
   2322 
   2323 	/* nothing is hashed under resultp */
   2324 	if (cur == NULL)
   2325 		return (2);
   2326 
   2327 	/* known request, but it is not flagged AIO_DONEQ yet */
   2328 	if ((cur->aio_req_flags & AIO_DONEQ) == 0)
   2329 		return (1);
   2330 
   2331 	*reqp = aio_req_remove(cur);
   2332 	return (0);
   2333 }
   2334 
   2335 /*
   2336  * remove a request from the done queue.
         *
         * With a non-NULL reqp, that request (which must be flagged
         * AIO_DONEQ) is unlinked from whichever of aio_doneq or aio_cleanupq
         * it heads or sits on.  With a NULL reqp, the request at the head of
         * aio_doneq is unlinked, if there is one.
         *
         * Returns the unlinked request, or NULL when reqp was NULL and
         * aio_doneq is empty.  The caller holds aio_mutex.
   2337  */
   2338 static aio_req_t *
   2339 aio_req_remove(aio_req_t *reqp)
   2340 {
   2341 	aio_t *aiop = curproc->p_aio;
   2342 
   2343 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
   2344 
   2345 	if (reqp != NULL) {
   2346 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
        		/* the queues are circular: a self-linked entry is alone */
   2347 		if (reqp->aio_req_next == reqp) {
   2348 			/* only one request on queue */
   2349 			if (reqp ==  aiop->aio_doneq) {
   2350 				aiop->aio_doneq = NULL;
   2351 			} else {
   2352 				ASSERT(reqp == aiop->aio_cleanupq);
   2353 				aiop->aio_cleanupq = NULL;
   2354 			}
   2355 		} else {
        			/* splice reqp out from between its neighbours */
   2356 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
   2357 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
   2358 			/*
   2359 			 * The request can be either on the aio_doneq or the
   2360 			 * aio_cleanupq
   2361 			 */
   2362 			if (reqp == aiop->aio_doneq)
   2363 				aiop->aio_doneq = reqp->aio_req_next;
   2364 
   2365 			if (reqp == aiop->aio_cleanupq)
   2366 				aiop->aio_cleanupq = reqp->aio_req_next;
   2367 		}
   2368 		reqp->aio_req_flags &= ~AIO_DONEQ;
   2369 		reqp->aio_req_next = NULL;
   2370 		reqp->aio_req_prev = NULL;
   2371 	} else if ((reqp = aiop->aio_doneq) != NULL) {
        		/* no request named: pop the head of aio_doneq */
   2372 		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
   2373 		if (reqp == reqp->aio_req_next) {
   2374 			/* only one request on queue */
   2375 			aiop->aio_doneq = NULL;
   2376 		} else {
   2377 			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
   2378 			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
   2379 			aiop->aio_doneq = reqp->aio_req_next;
   2380 		}
   2381 		reqp->aio_req_flags &= ~AIO_DONEQ;
   2382 		reqp->aio_req_next = NULL;
   2383 		reqp->aio_req_prev = NULL;
   2384 	}
        	/* tell AIO_WAITN waiters on aio_waitcv that aio_doneq is empty */
   2385 	if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
   2386 		cv_broadcast(&aiop->aio_waitcv);
   2387 	return (reqp);
   2388 }
   2389 
   2390 static int
   2391 aio_req_setup(
   2392 	aio_req_t	**reqpp,
   2393 	aio_t 		*aiop,
   2394 	aiocb_t 	*arg,
   2395 	aio_result_t 	*resultp,
   2396 	vnode_t		*vp,
   2397 	int		old_solaris_req)
   2398 {
        	/*
        	 * Build a request for the I/O described by the kernel copy of
        	 * the aiocb in "arg" and return it through "reqpp".
        	 *
        	 *	resultp		user address of the result; passed to
        	 *			aio_req_alloc()
        	 *	vp		stored in the request's buf (b_file)
        	 *	old_solaris_req	non-zero marks the request AIO_SOLARIS
        	 *
        	 * Returns 0 on success, EAGAIN if the sigqueue cannot be
        	 * allocated, EIO while AIO_REQ_BLOCK is set, or the error from
        	 * aio_req_alloc().  On success aio_pending and aio_outstanding
        	 * have been incremented.
        	 */
   2399 	sigqueue_t	*sqp = NULL;
   2400 	aio_req_t 	*reqp;
   2401 	struct uio 	*uio;
   2402 	struct sigevent *sigev;
   2403 	int		error;
   2404 
   2405 	sigev = &arg->aio_sigevent;
        	/*
        	 * Signal notification: fill in the siginfo before aio_mutex
        	 * is taken.  An out-of-range signal number is silently ignored.
        	 */
   2406 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
   2407 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
   2408 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
   2409 		if (sqp == NULL)
   2410 			return (EAGAIN);
   2411 		sqp->sq_func = NULL;
   2412 		sqp->sq_next = NULL;
   2413 		sqp->sq_info.si_code = SI_ASYNCIO;
   2414 		sqp->sq_info.si_pid = curproc->p_pid;
   2415 		sqp->sq_info.si_ctid = PRCTID(curproc);
   2416 		sqp->sq_info.si_zoneid = getzoneid();
   2417 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
   2418 		sqp->sq_info.si_signo = sigev->sigev_signo;
   2419 		sqp->sq_info.si_value = sigev->sigev_value;
   2420 	}
   2421 
   2422 	mutex_enter(&aiop->aio_mutex);
   2423 
        	/* no new requests are accepted while AIO_REQ_BLOCK is set */
   2424 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
   2425 		mutex_exit(&aiop->aio_mutex);
   2426 		if (sqp)
   2427 			kmem_free(sqp, sizeof (sigqueue_t));
   2428 		return (EIO);
   2429 	}
   2430 	/*
   2431 	 * get an aio_reqp from the free list or allocate one
   2432 	 * from dynamic memory.
   2433 	 */
   2434 	if (error = aio_req_alloc(&reqp, resultp)) {
   2435 		mutex_exit(&aiop->aio_mutex);
   2436 		if (sqp)
   2437 			kmem_free(sqp, sizeof (sigqueue_t));
   2438 		return (error);
   2439 	}
   2440 	aiop->aio_pending++;
   2441 	aiop->aio_outstanding++;
   2442 	reqp->aio_req_flags = AIO_PENDING;
   2443 	if (old_solaris_req) {
   2444 		/* this is an old solaris aio request */
   2445 		reqp->aio_req_flags |= AIO_SOLARIS;
   2446 		aiop->aio_flags |= AIO_SOLARIS_REQ;
   2447 	}
        	/* requests with port/thread notification go on aio_portpending */
   2448 	if (sigev->sigev_notify == SIGEV_THREAD ||
   2449 	    sigev->sigev_notify == SIGEV_PORT)
   2450 		aio_enq(&aiop->aio_portpending, reqp, 0);
   2451 	mutex_exit(&aiop->aio_mutex);
   2452 	/*
   2453 	 * initialize aio request.
   2454 	 */
   2455 	reqp->aio_req_fd = arg->aio_fildes;
   2456 	reqp->aio_req_sigqp = sqp;
   2457 	reqp->aio_req_iocb.iocb = NULL;
   2458 	reqp->aio_req_lio = NULL;
   2459 	reqp->aio_req_buf.b_file = vp;
        	/* a single iovec describes the user buffer */
   2460 	uio = reqp->aio_req.aio_uio;
   2461 	uio->uio_iovcnt = 1;
   2462 	uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
   2463 	uio->uio_iov->iov_len = arg->aio_nbytes;
   2464 	uio->uio_loffset = arg->aio_offset;
   2465 	*reqpp = reqp;
   2466 	return (0);
   2467 }
   2468 
   2469 /*
   2470  * Allocate (KM_NOSLEEP) and set up a p_aio struct; NULL on failure.
   2471  */
   2472 static aio_t *
   2473 aio_aiop_alloc(void)
   2474 {
   2475 	aio_t	*new_aiop;
   2476 
   2477 	ASSERT(MUTEX_HELD(&curproc->p_lock));
   2478 
   2479 	new_aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
   2480 	if (new_aiop == NULL)
   2481 		return (NULL);
   2482 
   2483 	mutex_init(&new_aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
   2484 	mutex_init(&new_aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT, NULL);
   2485 	mutex_init(&new_aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
   2486 	return (new_aiop);
   2487 }
   2488 
   2489 /*
   2490  * Allocate an aio_req struct.
         *
         * Takes a request from the per-process free list (zeroing it) or,
         * failing that, allocates a zeroed one without sleeping.  The request
         * is wired to its embedded uio/iovec, tagged with resultp and entered
         * into the hash.
         *
         * Returns 0 and stores the request in *nreqp, EAGAIN when memory is
         * short, or EBUSY when aio_hash_insert() rejects resultp (in which
         * case the request is put on the free list).  The caller holds
         * aio_mutex.
   2491  */
   2492 static int
   2493 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
   2494 {
   2495 	aio_req_t *reqp;
   2496 	aio_t *aiop = curproc->p_aio;
   2497 
   2498 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
   2499 
   2500 	if ((reqp = aiop->aio_free) != NULL) {
   2501 		aiop->aio_free = reqp->aio_req_next;
   2502 		bzero(reqp, sizeof (*reqp));
   2503 	} else {
   2504 		/*
   2505 		 * Check whether memory is getting tight.
   2506 		 * This is a temporary mechanism to avoid memory
   2507 		 * exhaustion by a single process until we come up
   2508 		 * with a per process solution such as setrlimit().
   2509 		 */
   2510 		if (freemem < desfree)
   2511 			return (EAGAIN);
   2512 		reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
   2513 		if (reqp == NULL)
   2514 			return (EAGAIN);
   2515 	}
        	/* point the public aio_req at the uio/iovec embedded in reqp */
   2516 	reqp->aio_req.aio_uio = &reqp->aio_req_uio;
   2517 	reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
   2518 	reqp->aio_req.aio_private = reqp;
   2519 	reqp->aio_req_buf.b_offset = -1;
   2520 	reqp->aio_req_resultp = resultp;
        	/* non-zero means resultp is already hashed: recycle reqp */
   2521 	if (aio_hash_insert(reqp, aiop)) {
   2522 		reqp->aio_req_next = aiop->aio_free;
   2523 		aiop->aio_free = reqp;
   2524 		return (EBUSY);
   2525 	}
   2526 	*nreqp = reqp;
   2527 	return (0);
   2528 }
   2529 
   2530 /*
   2531  * Hand out an aio_lio_t through "head"; 0 on success, else EAGAIN.
   2532  */
   2533 static int
   2534 aio_lio_alloc(aio_lio_t **head)
   2535 {
   2536 	aio_t *aiop = curproc->p_aio;
   2537 	aio_lio_t *lio;
   2538 
   2539 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
   2540 
   2541 	/* fast path: reuse the first entry on the per-process free list */
   2542 	lio = aiop->aio_lio_free;
   2543 	if (lio != NULL) {
   2544 		aiop->aio_lio_free = lio->lio_next;
   2545 		*head = lio;
   2546 		return (0);
   2547 	}
   2548 
   2549 	/*
   2550 	 * Free list is empty.  Refuse to allocate while free memory is
   2551 	 * below desfree; this stands in for a per-process limit.
   2552 	 */
   2553 	if (freemem < desfree)
   2554 		return (EAGAIN);
   2555 	if ((lio = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP)) == NULL)
   2556 		return (EAGAIN);
   2557 	*head = lio;
   2558 	return (0);
   2559 }
   2560 
   2561 /*
   2562  * this is a special per-process thread that is only activated if
   2563  * the process is unmapping a segment with outstanding aio. normally,
   2564  * the process will have completed the aio before unmapping the
   2565  * segment. If the process does unmap a segment with outstanding aio,
   2566  * this special thread will guarantee that the locked pages due to
   2567  * aphysio() are released, thereby permitting the segment to be
   2568  * unmapped. In addition to this, the cleanup thread is woken up
   2569  * during DR operations to release the locked pages.
         *
         * NOTE(review): the loop is left only through the "break" taken when
         * the process is exiting or killed and exit_flag is already set.  At
         * that point as->a_contents is still held; it is dropped right after
         * the loop.  The "exit" label is not the target of any goto in this
         * function.  The function always returns 0.
   2570  */
   2571 
   2572 static int
   2573 aio_cleanup_thread(aio_t *aiop)
   2574 {
   2575 	proc_t *p = curproc;
   2576 	struct as *as = p->p_as;
   2577 	int poked = 0;
   2578 	kcondvar_t *cvp;
   2579 	int exit_flag = 0;
   2580 	int rqclnup = 0;
   2581 
        	/* hold every signal except those in cantmask */
   2582 	sigfillset(&curthread->t_hold);
   2583 	sigdiffset(&curthread->t_hold, &cantmask);
   2584 	for (;;) {
   2585 		/*
   2586 		 * if a segment is being unmapped, and the current
   2587 		 * process's done queue is not empty, then every request
   2588 		 * on the doneq with locked resources should be forced
   2589 		 * to release their locks. By moving the doneq request
   2590 		 * to the cleanupq, aio_cleanup() will process the cleanupq,
   2591 		 * and place requests back onto the doneq. All requests
   2592 		 * processed by aio_cleanup() will have their physical
   2593 		 * resources unlocked.
   2594 		 */
   2595 		mutex_enter(&aiop->aio_mutex);
   2596 		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
   2597 			aiop->aio_flags |= AIO_CLEANUP;
        			/* latch and clear a pending cleanup request */
   2598 			mutex_enter(&as->a_contents);
   2599 			if (aiop->aio_rqclnup) {
   2600 				aiop->aio_rqclnup = 0;
   2601 				rqclnup = 1;
   2602 			}
   2603 			mutex_exit(&as->a_contents);
   2604 			if (aiop->aio_doneq) {
   2605 				aio_req_t *doneqhead = aiop->aio_doneq;
   2606 				aiop->aio_doneq = NULL;
   2607 				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
   2608 			}
   2609 		}
   2610 		mutex_exit(&aiop->aio_mutex);
   2611 		aio_cleanup(AIO_CLEANUP_THREAD);
   2612 		/*
   2613 		 * thread should block on the cleanupcv while
   2614 		 * AIO_CLEANUP is set.
   2615 		 */
   2616 		cvp = &aiop->aio_cleanupcv;
   2617 		mutex_enter(&aiop->aio_mutex);
   2618 
        		/* more work arrived on some queue: go around again */
   2619 		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
   2620 		    aiop->aio_notifyq != NULL ||
   2621 		    aiop->aio_portcleanupq != NULL) {
   2622 			mutex_exit(&aiop->aio_mutex);
   2623 			continue;
   2624 		}
   2625 		mutex_enter(&as->a_contents);
   2626 
   2627 		/*
   2628 		 * AIO_CLEANUP determines when the cleanup thread
   2629 		 * should be active. This flag is set when
   2630 		 * the cleanup thread is awakened by as_unmap() or
   2631 		 * due to DR operations.
   2632 		 * The flag is cleared when the blocking as_unmap()
   2633 		 * that originally awakened us is allowed to
   2634 		 * complete. as_unmap() blocks when trying to
   2635 		 * unmap a segment that has SOFTLOCKed pages. when
   2636 		 * the segment's pages are all SOFTUNLOCKed,
   2637 		 * as->a_flags & AS_UNMAPWAIT should be zero.
   2638 		 *
   2639 		 * In case of cleanup request by DR, the flag is cleared
   2640 		 * once all the pending aio requests have been processed.
   2641 		 *
   2642 		 * The flag shouldn't be cleared right away if the
   2643 		 * cleanup thread was interrupted because the process
   2644 		 * is doing forkall(). This happens when cv_wait_sig()
   2645 		 * returns zero, because it was awakened by a pokelwps().
   2646 		 * If the process is not exiting, it must be doing forkall().
   2647 		 */
   2648 		if ((poked == 0) &&
   2649 		    ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
   2650 		    (aiop->aio_pending == 0))) {
   2651 			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
        			/* cleanup is over: from now on wait on as->a_cv */
   2652 			cvp = &as->a_cv;
   2653 			rqclnup = 0;
   2654 		}
   2655 		mutex_exit(&aiop->aio_mutex);
        		/*
        		 * NOTE(review): if poked is set but none of the p_flag
        		 * bits tested below are, control falls through to the
        		 * mutex_exit() at the bottom of the loop and the loop
        		 * repeats with poked still set -- confirm intended.
        		 */
   2656 		if (poked) {
   2657 			/*
   2658 			 * If the process is exiting/killed, don't return
   2659 			 * immediately without waiting for pending I/O's
   2660 			 * and releasing the page locks.
   2661 			 */
   2662 			if (p->p_flag & (SEXITLWPS|SKILLED)) {
   2663 				/*
   2664 				 * If exit_flag is set, then it is
   2665 				 * safe to exit because we have released
   2666 				 * page locks of completed I/O's.
   2667 				 */
   2668 				if (exit_flag)
   2669 					break;
   2670 
   2671 				mutex_exit(&as->a_contents);
   2672 
   2673 				/*
   2674 				 * Wait for all the pending aio to complete.
   2675 				 */
   2676 				mutex_enter(&aiop->aio_mutex);
   2677 				aiop->aio_flags |= AIO_REQ_BLOCK;
   2678 				while (aiop->aio_pending != 0)
   2679 					cv_wait(&aiop->aio_cleanupcv,
   2680 					    &aiop->aio_mutex);
   2681 				mutex_exit(&aiop->aio_mutex);
   2682 				exit_flag = 1;
   2683 				continue;
   2684 			} else if (p->p_flag &
   2685 			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
   2686 				/*
   2687 				 * hold LWP until it
   2688 				 * is continued.
   2689 				 */
   2690 				mutex_exit(&as->a_contents);
   2691 				mutex_enter(&p->p_lock);
   2692 				stop(PR_SUSPENDED, SUSPEND_NORMAL);
   2693 				mutex_exit(&p->p_lock);
   2694 				poked = 0;
   2695 				continue;
   2696 			}
   2697 		} else {
   2698 			/*
   2699 			 * When started this thread will sleep on as->a_cv.
   2700 			 * as_unmap will awake this thread if the
   2701 			 * segment has SOFTLOCKed pages (poked = 0).
   2702 			 * 1. pokelwps() awakes this thread =>
   2703 			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
   2704 			 * 2. as_unmap awakes this thread =>
   2705 			 *    to break the loop it is necessary that
   2706 			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
   2707 			 *	memory to be unlocked)
   2708 			 *    - AIO_CLEANUP is not set
   2709 			 *	(if AIO_CLEANUP is set we have to wait for
   2710 			 *	pending requests. aio_done will send a signal
   2711 			 *	for every request which completes to continue
   2712 			 *	unmapping the corresponding address range)
   2713 			 * 3. A cleanup request will wake this thread up, ex.
   2714 			 *    by the DR operations. The aio_rqclnup flag will
   2715 			 *    be set.
   2716 			 */
   2717 			while (poked == 0) {
   2718 				/*
   2719 				 * The clean up requests that came in
   2720 				 * after we had just cleaned up, couldn't
   2721 				 * be causing the unmap thread to block - as
   2722 				 * unmap event happened first.
   2723 				 * Let aio_done() wake us up if it sees a need.
   2724 				 */
   2725 				if (aiop->aio_rqclnup &&
   2726 				    (aiop->aio_flags & AIO_CLEANUP) == 0)
   2727 					break;
        				/* a zero return means we were poked */
   2728 				poked = !cv_wait_sig(cvp, &as->a_contents);
   2729 				if (AS_ISUNMAPWAIT(as) == 0)
   2730 					cv_signal(cvp);
   2731 				if (aiop->aio_outstanding != 0)
   2732 					break;
   2733 			}
   2734 		}
   2735 		mutex_exit(&as->a_contents);
   2736 	}
   2737 exit:
        	/* reached by the break above, with as->a_contents still held */
   2738 	mutex_exit(&as->a_contents);
   2739 	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
   2740 	aston(curthread);	/* make thread do post_syscall */
   2741 	return (0);
   2742 }
   2743 
   2744 /*
   2745  * Hash a request by its user result pointer; DUPLICATE if already there.
   2746  */
   2747 static int
   2748 aio_hash_insert(
   2749 	aio_req_t *aio_reqp,
   2750 	aio_t *aiop)
   2751 {
   2752 	aio_result_t *key = aio_reqp->aio_req_resultp;
   2753 	aio_req_t **linkp;
   2754 
   2755 	/*
   2756 	 * Follow the chain's link pointers to its end; bail out if some
   2757 	 * request already hashed here carries the same result pointer.
   2758 	 */
   2759 	for (linkp = &aiop->aio_hash[AIO_HASH(key)]; *linkp != NULL;
   2760 	    linkp = &(*linkp)->aio_hash_next) {
   2761 		if ((*linkp)->aio_req_resultp == key)
   2762 			return (DUPLICATE);
   2763 	}
   2764 	/* linkp now addresses the terminating NULL link: append there */
   2765 	aio_reqp->aio_hash_next = NULL;
   2766 	*linkp = aio_reqp;
   2767 	return (0);
   2768 }
   2768 
   2769 static int
   2770 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
   2771     cred_t *)
   2772 {
        	/*
        	 * Pick the function that will carry out kernel async I/O on
        	 * vp, or return NULL when kaio cannot be used for it.
        	 *
        	 *	vp	vnode of the open file
        	 *	mode	FREAD selects the read function, anything
        	 *		else the write function
        	 *
        	 * On success the vnode's snode is also marked SACC.
        	 */
   2773 	struct snode *sp;
   2774 	dev_t		dev;
   2775 	struct cb_ops  	*cb;
   2776 	major_t		major;
   2777 	int		(*aio_func)();
   2778 
   2779 	dev = vp->v_rdev;
   2780 	major = getmajor(dev);
   2781 
   2782 	/*
   2783 	 * return NULL for requests to files and STREAMs so
   2784 	 * that libaio takes care of them.
   2785 	 */
   2786 	if (vp->v_type == VCHR) {
   2787 		/* no stream device for kaio */
   2788 		if (STREAMSTAB(major)) {
   2789 			return (NULL);
   2790 		}
   2791 	} else {
   2792 		return (NULL);
   2793 	}
   2794 
   2795 	/*
   2796 	 * Check old drivers which do not have async I/O entry points.
   2797 	 */
   2798 	if (devopsp[major]->devo_rev < 3)
   2799 		return (NULL);
   2800 
   2801 	cb = devopsp[major]->devo_cb_ops;
   2802 
   2803 	if (cb->cb_rev < 1)
   2804 		return (NULL);
   2805 
   2806 	/*
   2807 	 * Check whether this device is a block device.
   2808 	 * Kaio is not supported for devices like tty.
   2809 	 */
   2810 	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
   2811 		return (NULL);
   2812 
   2813 	/*
   2814 	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
   2815 	 * We cannot call the driver directly. Instead return the
   2816 	 * PXFS functions.
   2817 	 */
   2818 
   2819 	if (IS_PXFSVP(vp)) {
   2820 		if (mode & FREAD)
   2821 			return (clpxfs_aio_read);
   2822 		else
   2823 			return (clpxfs_aio_write);
   2824 	}
        	/* local device: use the wrappers unless the entry point is nodev */
   2825 	if (mode & FREAD)
   2826 		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
   2827 	else
   2828 		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
   2829 
   2830 	/*
   2831 	 * Do we need this ?
   2832 	 * nodev returns ENXIO anyway.
         *
         * NOTE(review): aio_func was just set to NULL, driver_aio_read or
         * driver_aio_write, so this comparison cannot be true.  When
         * aio_func is NULL the code below still marks the snode before
         * returning NULL.
   2833 	 */
   2834 	if (aio_func == nodev)
   2835 		return (NULL);
   2836 
   2837 	sp = VTOS(vp);
   2838 	smark(sp, SACC);
   2839 	return (aio_func);
   2840 }
   2841 
   2842 /*
   2843  * Clustering: check_vp hands back one function type for both the
   2844  * PXFS and the plain-driver case.  This is the plain-driver write
   2845  * flavor: it resolves the vnode's device and forwards the request
   2846  * to that driver's cb_awrite entry point.
   2847  */
   2848 
   2849 static int
   2850 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
   2851 {
   2852 	struct cb_ops	*ops;
   2853 	dev_t		rdev;
   2854 
   2855 	ASSERT(vp->v_type == VCHR);
   2856 	ASSERT(!IS_PXFSVP(vp));
   2857 
   2858 	rdev = VTOS(vp)->s_dev;
   2859 	ASSERT(STREAMSTAB(getmajor(rdev)) == NULL);
   2860 
   2861 	ops = devopsp[getmajor(rdev)]->devo_cb_ops;
   2862 	ASSERT(ops->cb_awrite != nodev);
   2863 	return (ops->cb_awrite(rdev, aio, cred_p));
   2864 }
   2865 
   2866 /*
   2867  * Clustering: check_vp hands back one function type for both the
   2868  * PXFS and the plain-driver case.  This is the plain-driver read
   2869  * flavor: it resolves the vnode's device and forwards the request
   2870  * to that driver's cb_aread entry point.
   2871  */
   2872 
   2873 static int
   2874 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
   2875 {
   2876 	struct cb_ops	*ops;
   2877 	dev_t		rdev;
   2878 
   2879 	ASSERT(vp->v_type == VCHR);
   2880 	ASSERT(!IS_PXFSVP(vp));
   2881 
   2882 	rdev = VTOS(vp)->s_dev;
   2883 	ASSERT(STREAMSTAB(getmajor(rdev)) == NULL);
   2884 
   2885 	ops = devopsp[getmajor(rdev)]->devo_cb_ops;
   2886 	ASSERT(ops->cb_aread != nodev);
   2887 	return (ops->cb_aread(rdev, aio, cred_p));
   2888 }
   2889 
   2890 /*
   2891  * This routine is called when a largefile call is made by a 32-bit
   2892  * process on an ILP32 or LP64 kernel. All 64-bit processes are large
   2893  * file by definition and will call alio() instead.
   2894  */
   2895 static int
   2896 alioLF(
   2897 	int		mode_arg,
   2898 	void		*aiocb_arg,
   2899 	int		nent,
   2900 	void		*sigev)
   2901 {
   2902 	file_t		*fp;
   2903 	file_t		*prev_fp = NULL;
   2904 	int		prev_mode = -1;
   2905 	struct vnode	*vp;
   2906 	aio_lio_t	*head;
   2907 	aio_req_t	*reqp;
   2908 	aio_t		*aiop;
   2909 	caddr_t		cbplist;
   2910 	aiocb64_32_t	cb64;
   2911 	aiocb64_32_t	*aiocb = &cb64;
   2912 	aiocb64_32_t	*cbp;
   2913 	caddr32_t	*ucbp;
   2914 #ifdef _LP64
   2915 	aiocb_t		aiocb_n;
   2916 #endif
   2917 	struct sigevent32	sigevk;
   2918 	sigqueue_t	*sqp;
   2919 	int		(*aio_func)();
   2920 	int		mode;
   2921 	int		error = 0;
   2922 	int		aio_errors = 0;
   2923 	int		i;
   2924 	size_t		ssize;
   2925 	int		deadhead = 0;
   2926 	int		aio_notsupported = 0;
   2927 	int		lio_head_port;
   2928 	int		aio_port;
   2929 	int		aio_thread;
   2930 	port_kevent_t	*pkevtp = NULL;
   2931 	int		portused = 0;
   2932 	port_notify32_t	pnotify;
   2933 	int		event;
   2934 
   2935 	aiop = curproc->p_aio;
   2936 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
   2937 		return (EINVAL);
   2938 
   2939 	ASSERT(get_udatamodel() == DATAMODEL_ILP32);
   2940 
   2941 	ssize = (sizeof (caddr32_t) * nent);
   2942 	cbplist = kmem_alloc(ssize, KM_SLEEP);
   2943 	ucbp = (caddr32_t *)cbplist;
   2944 
   2945 	if (copyin(aiocb_arg, cbplist, ssize) ||
   2946 	    (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
   2947 		kmem_free(cbplist, ssize);
   2948 		return (EFAULT);
   2949 	}
   2950 
   2951 	/* Event Ports  */
   2952 	if (sigev &&
   2953 	    (sigevk.sigev_notify == SIGEV_THREAD ||
   2954 	    sigevk.sigev_notify == SIGEV_PORT)) {
   2955 		if (sigevk.sigev_notify == SIGEV_THREAD) {
   2956 			pnotify.portnfy_port = sigevk.sigev_signo;
   2957 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
   2958 		} else if (copyin(
   2959 		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
   2960 		    &pnotify, sizeof (pnotify))) {
   2961 			kmem_free(cbplist, ssize);
   2962 			return (EFAULT);
   2963 		}
   2964 		error = port_alloc_event(pnotify.portnfy_port,
   2965 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
   2966 		if (error) {
   2967 			if (error == ENOMEM || error == EAGAIN)
   2968 				error = EAGAIN;
   2969 			else
   2970 				error = EINVAL;
   2971 			kmem_free(cbplist, ssize);
   2972 			return (error);
   2973 		}
   2974 		lio_head_port = pnotify.portnfy_port;
   2975 		portused = 1;
   2976 	}
   2977 
   2978 	/*
   2979 	 * a list head should be allocated if notification is
   2980 	 * enabled for this list.
   2981 	 */
   2982 	head = NULL;
   2983 
   2984 	if (mode_arg == LIO_WAIT || sigev) {
   2985 		mutex_enter(&aiop->aio_mutex);
   2986 		error = aio_lio_alloc(&head);
   2987 		mutex_exit(&aiop->aio_mutex);
   2988 		if (error)
   2989 			goto done;
   2990 		deadhead = 1;
   2991 		head->lio_nent = nent;
   2992 		head->lio_refcnt = nent;
   2993 		head->lio_port = -1;
   2994 		head->lio_portkev = NULL;
   2995 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
   2996 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
   2997 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
   2998 			if (sqp == NULL) {
   2999 				error = EAGAIN;
   3000 				goto done;
   3001 			}
   3002 			sqp->sq_func = NULL;
   3003 			sqp->sq_next = NULL;
   3004 			sqp->sq_info.si_code = SI_ASYNCIO;
   3005 			sqp->sq_info.si_pid = curproc->p_pid;
   3006 			sqp->sq_info.si_ctid = PRCTID(curproc);
   3007 			sqp->sq_info.si_zoneid = getzoneid();
   3008 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
   3009 			sqp->sq_info.si_signo = sigevk.sigev_signo;
   3010 			sqp->sq_info.si_value.sival_int =
   3011 			    sigevk.sigev_value.sival_int;
   3012 			head->lio_sigqp = sqp;
   3013 		} else {
   3014 			head->lio_sigqp = NULL;
   3015 		}
   3016 		if (pkevtp) {
   3017 			/*
   3018 			 * Prepare data to send when list of aiocb's
   3019 			 * has completed.
   3020 			 */
   3021 			port_init_event(pkevtp, (uintptr_t)sigev,
   3022 			    (void *)(uintptr_t)pnotify.portnfy_user,
   3023 			    NULL, head);
   3024 			pkevtp->portkev_events = AIOLIO64;
   3025 			head->lio_portkev = pkevtp;
   3026 			head->lio_port = pnotify.portnfy_port;
   3027 		}
   3028 	}
   3029 
   3030 	for (i = 0; i < nent; i++, ucbp++) {
   3031 
   3032 		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
   3033 		/* skip entry if it can't be copied. */
   3034 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
   3035 			if (head) {
   3036 				mutex_enter(&aiop->aio_mutex);
   3037 				head->lio_nent--;
   3038 				head->lio_refcnt--;
   3039 				mutex_exit(&aiop->aio_mutex);
   3040 			}
   3041 			continue;
   3042 		}
   3043 
   3044 		/* skip if opcode for aiocb is LIO_NOP */
   3045 		mode = aiocb->aio_lio_opcode;
   3046 		if (mode == LIO_NOP) {
   3047 			cbp = NULL;
   3048 			if (head) {
   3049 				mutex_enter(&aiop->aio_mutex);
   3050 				head->lio_nent--;
   3051 				head->lio_refcnt--;
   3052 				mutex_exit(&aiop->aio_mutex);
   3053 			}
   3054 			continue;
   3055 		}
   3056 
   3057 		/* increment file descriptor's ref count. */
   3058 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
   3059 			lio_set_uerror(&cbp->aio_resultp, EBADF);
   3060 			if (head) {
   3061 				mutex_enter(&aiop->aio_mutex);
   3062 				head->lio_nent--;
   3063 				head->lio_refcnt--;
   3064 				mutex_exit(&aiop->aio_mutex);
   3065 			}
   3066 			aio_errors++;
   3067 			continue;
   3068 		}
   3069 
   3070 		/*
   3071 		 * check the permission of the partition
   3072 		 */
   3073 		if ((fp->f_flag & mode) == 0) {
   3074 			releasef(aiocb->aio_fildes);
   3075 			lio_set_uerror(&cbp->aio_resultp, EBADF);
   3076 			if (head) {
   3077 				mutex_enter(&aiop->aio_mutex);
   3078 				head->lio_nent--;
   3079 				head->lio_refcnt--;
   3080 				mutex_exit(&aiop->aio_mutex);
   3081 			}
   3082 			aio_errors++;
   3083 			continue;
   3084 		}
   3085 
   3086 		/*
   3087 		 * common case where requests are to the same fd
   3088 		 * for the same r/w operation
   3089 		 * for UFS, need to set EBADFD
   3090 		 */
   3091 		vp = fp->f_vnode;
   3092 		if (fp != prev_fp || mode != prev_mode) {
   3093 			aio_func = check_vp(vp, mode);
   3094 			if (aio_func == NULL) {
   3095 				prev_fp = NULL;
   3096 				releasef(aiocb->aio_fildes);
   3097 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
   3098 				aio_notsupported++;
   3099 				if (head) {
   3100 					mutex_enter(&aiop->aio_mutex);
   3101 					head->lio_nent--;
   3102 					head->lio_refcnt--;
   3103 					mutex_exit(&aiop->aio_mutex);
   3104 				}
   3105 				continue;
   3106 			} else {
   3107 				prev_fp = fp;
   3108 				prev_mode = mode;
   3109 			}
   3110 		}
   3111 
   3112 #ifdef	_LP64
   3113 		aiocb_LFton(aiocb, &aiocb_n);
   3114 		error = aio_req_setup(&reqp, aiop, &aiocb_n,
   3115 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
   3116 #else
   3117 		error = aio_req_setupLF(&reqp, aiop, aiocb,
   3118 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
   3119 #endif  /* _LP64 */
   3120 		if (error) {
   3121 			releasef(aiocb->aio_fildes);
   3122 			lio_set_uerror(&cbp->aio_resultp, error);
   3123 			if (head) {
   3124 				mutex_enter(&aiop->aio_mutex);
   3125 				head->lio_nent--;
   3126 				head->lio_refcnt--;
   3127 				mutex_exit(&aiop->aio_mutex);
   3128 			}
   3129 			aio_errors++;
   3130 			continue;
   3131 		}
   3132 
   3133 		reqp->aio_req_lio = head;
   3134 		deadhead = 0;
   3135 
   3136 		/*
   3137 		 * Set the errno field now before sending the request to
   3138 		 * the driver to avoid a race condition
   3139 		 */
   3140 		(void) suword32(&cbp->aio_resultp.aio_errno,
   3141 		    EINPROGRESS);
   3142 
   3143 		reqp->aio_req_iocb.iocb32 = *ucbp;
   3144 
   3145 		event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
   3146 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
   3147 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
   3148 		if (aio_port | aio_thread) {
   3149 			port_kevent_t *lpkevp;
   3150 			/*
   3151 			 * Prepare data to send with each aiocb completed.
   3152 			 */
   3153 			if (aio_port) {
   3154 				void *paddr = (void *)(uintptr_t)
   3155 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
   3156 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
   3157 					error = EFAULT;
   3158 			} else {	/* aio_thread */
   3159 				pnotify.portnfy_port =
   3160 				    aiocb->aio_sigevent.sigev_signo;
   3161 				pnotify.portnfy_user =
   3162 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
   3163 			}
   3164 			if (error)
   3165 				/* EMPTY */;
   3166 			else if (pkevtp != NULL &&
   3167 			    pnotify.portnfy_port == lio_head_port)
   3168 				error = port_dup_event(pkevtp, &lpkevp,
   3169 				    PORT_ALLOC_DEFAULT);
   3170 			else
   3171 				error = port_alloc_event(pnotify.portnfy_port,
   3172 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
   3173 				    &lpkevp);
   3174 			if (error == 0) {
   3175 				port_init_event(lpkevp, (uintptr_t)*ucbp,
   3176 				    (void *)(uintptr_t)pnotify.portnfy_user,
   3177 				    aio_port_callback, reqp);
   3178 				lpkevp->portkev_events = event;
   3179 				reqp->aio_req_portkev = lpkevp;
   3180 				reqp->aio_req_port = pnotify.portnfy_port;
   3181 			}
   3182 		}
   3183 
   3184 		/*
   3185 		 * send the request to driver.
   3186 		 */
   3187 		if (error == 0) {
   3188 			if (aiocb->aio_nbytes == 0) {
   3189 				clear_active_fd(aiocb->aio_fildes);
   3190 				aio_zerolen(reqp);
   3191 				continue;
   3192 			}
   3193 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
   3194 			    CRED());
   3195 		}
   3196 
   3197 		/*
   3198 		 * the fd's ref count is not decremented until the IO has
   3199 		 * completed unless there was an error.
   3200 		 */
   3201 		if (error) {
   3202 			releasef(aiocb->aio_fildes);
   3203 			lio_set_uerror(&cbp->aio_resultp, error);
   3204 			if (head) {
   3205 				mutex_enter(&aiop->aio_mutex);
   3206 				head->lio_nent--;
   3207 				head->lio_refcnt--;
   3208 				mutex_exit(&aiop->aio_mutex);
   3209 			}
   3210 			if (error == ENOTSUP)
   3211 				aio_notsupported++;
   3212 			else
   3213 				aio_errors++;
   3214 			lio_set_error(reqp, portused);
   3215 		} else {
   3216 			clear_active_fd(aiocb->aio_fildes);
   3217 		}
   3218 	}
   3219 
   3220 	if (aio_notsupported) {
   3221 		error = ENOTSUP;
   3222 	} else if (aio_errors) {
   3223 		/*
   3224 		 * return EIO if any request failed
   3225 		 */
   3226 		error = EIO;
   3227 	}
   3228 
   3229 	if (mode_arg == LIO_WAIT) {
   3230 		mutex_enter(&aiop->aio_mutex);
   3231 		while (head->lio_refcnt > 0) {
   3232 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
   3233 				mutex_exit(&aiop->aio_mutex);
   3234 				error = EINTR;
   3235 				goto done;
   3236 			}
   3237 		}
   3238 		mutex_exit(&aiop->aio_mutex);
   3239 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
   3240 	}
   3241 
   3242 done:
   3243 	kmem_free(cbplist, ssize);
   3244 	if (deadhead) {
   3245 		if (head->lio_sigqp)
   3246 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
   3247 		if (head->lio_portkev)
   3248 			port_free_event(head->lio_portkev);
   3249 		kmem_free(head, sizeof (aio_lio_t));
   3250 	}
   3251 	return (error);
   3252 }
   3253 
   3254 #ifdef  _SYSCALL32_IMPL
   3255 static void
   3256 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
   3257 {
   3258 	dest->aio_fildes = src->aio_fildes;
   3259 	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
   3260 	dest->aio_nbytes = (size_t)src->aio_nbytes;
   3261 	dest->aio_offset = (off_t)src->aio_offset;
   3262 	dest->aio_reqprio = src->aio_reqprio;
   3263 	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
   3264 	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
   3265 
   3266 	/*
   3267 	 * See comment in sigqueue32() on handling of 32-bit
   3268 	 * sigvals in a 64-bit kernel.
   3269 	 */
   3270 	dest->aio_sigevent.sigev_value.sival_int =
   3271 	    (int)src->aio_sigevent.sigev_value.sival_int;
   3272 	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
   3273 	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
   3274 	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
   3275 	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
   3276 	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
   3277 	dest->aio_lio_opcode = src->aio_lio_opcode;
   3278 	dest->aio_state = src->aio_state;
   3279 	dest->aio__pad[0] = src->aio__pad[0];
   3280 }
   3281 #endif
   3282 
   3283 /*
   3284  * This function is used only for largefile calls made by
   3285  * 32 bit applications.
   3286  */
   3287 static int
   3288 aio_req_setupLF(
   3289 	aio_req_t	**reqpp,
   3290 	aio_t		*aiop,
   3291 	aiocb64_32_t	*arg,
   3292 	aio_result_t	*resultp,
   3293 	vnode_t		*vp,
   3294 	int		old_solaris_req)
   3295 {
   3296 	sigqueue_t	*sqp = NULL;
   3297 	aio_req_t	*reqp;
   3298 	struct uio	*uio;
   3299 	struct sigevent32 *sigev;
   3300 	int 		error;
   3301 
   3302 	sigev = &arg->aio_sigevent;
   3303 	if (sigev->sigev_notify == SIGEV_SIGNAL &&
   3304 	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
   3305 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
   3306 		if (sqp == NULL)
   3307 			return (EAGAIN);
   3308 		sqp->sq_func = NULL;
   3309 		sqp->sq_next = NULL;
   3310 		sqp->sq_info.si_code = SI_ASYNCIO;
   3311 		sqp->sq_info.si_pid = curproc->p_pid;
   3312 		sqp->sq_info.si_ctid = PRCTID(curproc);
   3313 		sqp->sq_info.si_zoneid = getzoneid();
   3314 		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
   3315 		sqp->sq_info.si_signo = sigev->sigev_signo;
   3316 		sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
   3317 	}
   3318 
   3319 	mutex_enter(&aiop->aio_mutex);
   3320 
   3321 	if (aiop->aio_flags & AIO_REQ_BLOCK) {
   3322 		mutex_exit(&aiop->aio_mutex);
   3323 		if (sqp)
   3324 			kmem_free(sqp, sizeof (sigqueue_t));
   3325 		return (EIO);
   3326 	}
   3327 	/*
   3328 	 * get an aio_reqp from the free list or allocate one
   3329 	 * from dynamic memory.
   3330 	 */
   3331 	if (error = aio_req_alloc(&reqp, resultp)) {
   3332 		mutex_exit(&aiop->aio_mutex);
   3333 		if (sqp)
   3334 			kmem_free(sqp, sizeof (sigqueue_t));
   3335 		return (error);
   3336 	}
   3337 	aiop->aio_pending++;
   3338 	aiop->aio_outstanding++;
   3339 	reqp->aio_req_flags = AIO_PENDING;
   3340 	if (old_solaris_req) {
   3341 		/* this is an old solaris aio request */
   3342 		reqp->aio_req_flags |= AIO_SOLARIS;
   3343 		aiop->aio_flags |= AIO_SOLARIS_REQ;
   3344 	}
   3345 	if (sigev->sigev_notify == SIGEV_THREAD ||
   3346 	    sigev->sigev_notify == SIGEV_PORT)
   3347 		aio_enq(&aiop->aio_portpending, reqp, 0);
   3348 	mutex_exit(&aiop->aio_mutex);
   3349 	/*
   3350 	 * initialize aio request.
   3351 	 */
   3352 	reqp->aio_req_fd = arg->aio_fildes;
   3353 	reqp->aio_req_sigqp = sqp;
   3354 	reqp->aio_req_iocb.iocb = NULL;
   3355 	reqp->aio_req_lio = NULL;
   3356 	reqp->aio_req_buf.b_file = vp;
   3357 	uio = reqp->aio_req.aio_uio;
   3358 	uio->uio_iovcnt = 1;
   3359 	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
   3360 	uio->uio_iov->iov_len = arg->aio_nbytes;
   3361 	uio->uio_loffset = arg->aio_offset;
   3362 	*reqpp = reqp;
   3363 	return (0);
   3364 }
   3365 
   3366 /*
   3367  * This routine is called when a non largefile call is made by a 32bit
   3368  * process on a ILP32 or LP64 kernel.
   3369  */
   3370 static int
   3371 alio32(
   3372 	int		mode_arg,
   3373 	void		*aiocb_arg,
   3374 	int		nent,
   3375 	void		*sigev)
   3376 {
   3377 	file_t		*fp;
   3378 	file_t		*prev_fp = NULL;
   3379 	int		prev_mode = -1;
   3380 	struct vnode	*vp;
   3381 	aio_lio_t	*head;
   3382 	aio_req_t	*reqp;
   3383 	aio_t		*aiop;
   3384 	caddr_t		cbplist;
   3385 	aiocb_t		cb;
   3386 	aiocb_t		*aiocb = &cb;
   3387 #ifdef	_LP64
   3388 	aiocb32_t	*cbp;
   3389 	caddr32_t	*ucbp;
   3390 	aiocb32_t	cb32;
   3391 	aiocb32_t	*aiocb32 = &cb32;
   3392 	struct sigevent32	sigevk;
   3393 #else
   3394 	aiocb_t		*cbp, **ucbp;
   3395 	struct sigevent	sigevk;
   3396 #endif
   3397 	sigqueue_t	*sqp;
   3398 	int		(*aio_func)();
   3399 	int		mode;
   3400 	int		error = 0;
   3401 	int		aio_errors = 0;
   3402 	int		i;
   3403 	size_t		ssize;
   3404 	int		deadhead = 0;
   3405 	int		aio_notsupported = 0;
   3406 	int		lio_head_port;
   3407 	int		aio_port;
   3408 	int		aio_thread;
   3409 	port_kevent_t	*pkevtp = NULL;
   3410 	int		portused = 0;
   3411 #ifdef	_LP64
   3412 	port_notify32_t	pnotify;
   3413 #else
   3414 	port_notify_t	pnotify;
   3415 #endif
   3416 	int		event;
   3417 
   3418 	aiop = curproc->p_aio;
   3419 	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
   3420 		return (EINVAL);
   3421 
   3422 #ifdef	_LP64
   3423 	ssize = (sizeof (caddr32_t) * nent);
   3424 #else
   3425 	ssize = (sizeof (aiocb_t *) * nent);
   3426 #endif
   3427 	cbplist = kmem_alloc(ssize, KM_SLEEP);
   3428 	ucbp = (void *)cbplist;
   3429 
   3430 	if (copyin(aiocb_arg, cbplist, ssize) ||
   3431 	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
   3432 		kmem_free(cbplist, ssize);
   3433 		return (EFAULT);
   3434 	}
   3435 
   3436 	/* Event Ports  */
   3437 	if (sigev &&
   3438 	    (sigevk.sigev_notify == SIGEV_THREAD ||
   3439 	    sigevk.sigev_notify == SIGEV_PORT)) {
   3440 		if (sigevk.sigev_notify == SIGEV_THREAD) {
   3441 			pnotify.portnfy_port = sigevk.sigev_signo;
   3442 			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
   3443 		} else if (copyin(
   3444 		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
   3445 		    &pnotify, sizeof (pnotify))) {
   3446 			kmem_free(cbplist, ssize);
   3447 			return (EFAULT);
   3448 		}
   3449 		error = port_alloc_event(pnotify.portnfy_port,
   3450 		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
   3451 		if (error) {
   3452 			if (error == ENOMEM || error == EAGAIN)
   3453 				error = EAGAIN;
   3454 			else
   3455 				error = EINVAL;
   3456 			kmem_free(cbplist, ssize);
   3457 			return (error);
   3458 		}
   3459 		lio_head_port = pnotify.portnfy_port;
   3460 		portused = 1;
   3461 	}
   3462 
   3463 	/*
   3464 	 * a list head should be allocated if notification is
   3465 	 * enabled for this list.
   3466 	 */
   3467 	head = NULL;
   3468 
   3469 	if (mode_arg == LIO_WAIT || sigev) {
   3470 		mutex_enter(&aiop->aio_mutex);
   3471 		error = aio_lio_alloc(&head);
   3472 		mutex_exit(&aiop->aio_mutex);
   3473 		if (error)
   3474 			goto done;
   3475 		deadhead = 1;
   3476 		head->lio_nent = nent;
   3477 		head->lio_refcnt = nent;
   3478 		head->lio_port = -1;
   3479 		head->lio_portkev = NULL;
   3480 		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
   3481 		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
   3482 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
   3483 			if (sqp == NULL) {
   3484 				error = EAGAIN;
   3485 				goto done;
   3486 			}
   3487 			sqp->sq_func = NULL;
   3488 			sqp->sq_next = NULL;
   3489 			sqp->sq_info.si_code = SI_ASYNCIO;
   3490 			sqp->sq_info.si_pid = curproc->p_pid;
   3491 			sqp->sq_info.si_ctid = PRCTID(curproc);
   3492 			sqp->sq_info.si_zoneid = getzoneid();
   3493 			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
   3494 			sqp->sq_info.si_signo = sigevk.sigev_signo;
   3495 			sqp->sq_info.si_value.sival_int =
   3496 			    sigevk.sigev_value.sival_int;
   3497 			head->lio_sigqp = sqp;
   3498 		} else {
   3499 			head->lio_sigqp = NULL;
   3500 		}
   3501 		if (pkevtp) {
   3502 			/*
   3503 			 * Prepare data to send when list of aiocb's has
   3504 			 * completed.
   3505 			 */
   3506 			port_init_event(pkevtp, (uintptr_t)sigev,
   3507 			    (void *)(uintptr_t)pnotify.portnfy_user,
   3508 			    NULL, head);
   3509 			pkevtp->portkev_events = AIOLIO;
   3510 			head->lio_portkev = pkevtp;
   3511 			head->lio_port = pnotify.portnfy_port;
   3512 		}
   3513 	}
   3514 
   3515 	for (i = 0; i < nent; i++, ucbp++) {
   3516 
   3517 		/* skip entry if it can't be copied. */
   3518 #ifdef	_LP64
   3519 		cbp = (aiocb32_t *)(uintptr_t)*ucbp;
   3520 		if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
   3521 #else
   3522 		cbp = (aiocb_t *)*ucbp;
   3523 		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
   3524 #endif
   3525 		{
   3526 			if (head) {
   3527 				mutex_enter(&aiop->aio_mutex);
   3528 				head->lio_nent--;
   3529 				head->lio_refcnt--;
   3530 				mutex_exit(&aiop->aio_mutex);
   3531 			}
   3532 			continue;
   3533 		}
   3534 #ifdef	_LP64
   3535 		/*
   3536 		 * copy 32 bit structure into 64 bit structure
   3537 		 */
   3538 		aiocb_32ton(aiocb32, aiocb);
   3539 #endif /* _LP64 */
   3540 
   3541 		/* skip if opcode for aiocb is LIO_NOP */
   3542 		mode = aiocb->aio_lio_opcode;
   3543 		if (mode == LIO_NOP) {
   3544 			cbp = NULL;
   3545 			if (head) {
   3546 				mutex_enter(&aiop->aio_mutex);
   3547 				head->lio_nent--;
   3548 				head->lio_refcnt--;
   3549 				mutex_exit(&aiop->aio_mutex);
   3550 			}
   3551 			continue;
   3552 		}
   3553 
   3554 		/* increment file descriptor's ref count. */
   3555 		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
   3556 			lio_set_uerror(&cbp->aio_resultp, EBADF);
   3557 			if (head) {
   3558 				mutex_enter(&aiop->aio_mutex);
   3559 				head->lio_nent--;
   3560 				head->lio_refcnt--;
   3561 				mutex_exit(&aiop->aio_mutex);
   3562 			}
   3563 			aio_errors++;
   3564 			continue;
   3565 		}
   3566 
   3567 		/*
   3568 		 * check the permission of the partition
   3569 		 */
   3570 		if ((fp->f_flag & mode) == 0) {
   3571 			releasef(aiocb->aio_fildes);
   3572 			lio_set_uerror(&cbp->aio_resultp, EBADF);
   3573 			if (head) {
   3574 				mutex_enter(&aiop->aio_mutex);
   3575 				head->lio_nent--;
   3576 				head->lio_refcnt--;
   3577 				mutex_exit(&aiop->aio_mutex);
   3578 			}
   3579 			aio_errors++;
   3580 			continue;
   3581 		}
   3582 
   3583 		/*
   3584 		 * common case where requests are to the same fd
   3585 		 * for the same r/w operation
   3586 		 * for UFS, need to set EBADFD
   3587 		 */
   3588 		vp = fp->f_vnode;
   3589 		if (fp != prev_fp || mode != prev_mode) {
   3590 			aio_func = check_vp(vp, mode);
   3591 			if (aio_func == NULL) {
   3592 				prev_fp = NULL;
   3593 				releasef(aiocb->aio_fildes);
   3594 				lio_set_uerror(&cbp->aio_resultp, EBADFD);
   3595 				aio_notsupported++;
   3596 				if (head) {
   3597 					mutex_enter(&aiop->aio_mutex);
   3598 					head->lio_nent--;
   3599 					head->lio_refcnt--;
   3600 					mutex_exit(&aiop->aio_mutex);
   3601 				}
   3602 				continue;
   3603 			} else {
   3604 				prev_fp = fp;
   3605 				prev_mode = mode;
   3606 			}
   3607 		}
   3608 
   3609 		error = aio_req_setup(&reqp, aiop, aiocb,
   3610 		    (aio_result_t *)&cbp->aio_resultp, vp, 0);
   3611 		if (error) {
   3612 			releasef(aiocb->aio_fildes);
   3613 			lio_set_uerror(&cbp->aio_resultp, error);
   3614 			if (head) {
   3615 				mutex_enter(&aiop->aio_mutex);
   3616 				head->lio_nent--;
   3617 				head->lio_refcnt--;
   3618 				mutex_exit(&aiop->aio_mutex);
   3619 			}
   3620 			aio_errors++;
   3621 			continue;
   3622 		}
   3623 
   3624 		reqp->aio_req_lio = head;
   3625 		deadhead = 0;
   3626 
   3627 		/*
   3628 		 * Set the errno field now before sending the request to
   3629 		 * the driver to avoid a race condition
   3630 		 */
   3631 		(void) suword32(&cbp->aio_resultp.aio_errno,
   3632 		    EINPROGRESS);
   3633 
   3634 		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;
   3635 
   3636 		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
   3637 		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
   3638 		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
   3639 		if (aio_port | aio_thread) {
   3640 			port_kevent_t *lpkevp;
   3641 			/*
   3642 			 * Prepare data to send with each aiocb completed.
   3643 			 */
   3644 #ifdef _LP64
   3645 			if (aio_port) {
   3646 				void *paddr = (void  *)(uintptr_t)
   3647 				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
   3648 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
   3649 					error = EFAULT;
   3650 			} else {	/* aio_thread */
   3651 				pnotify.portnfy_port =
   3652 				    aiocb32->aio_sigevent.sigev_signo;
   3653 				pnotify.portnfy_user =
   3654 				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
   3655 			}
   3656 #else
   3657 			if (aio_port) {
   3658 				void *paddr =
   3659 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
   3660 				if (copyin(paddr, &pnotify, sizeof (pnotify)))
   3661 					error = EFAULT;
   3662 			} else {	/* aio_thread */
   3663 				pnotify.portnfy_port =
   3664 				    aiocb->aio_sigevent.sigev_signo;
   3665 				pnotify.portnfy_user =
   3666 				    aiocb->aio_sigevent.sigev_value.sival_ptr;
   3667 			}
   3668 #endif
   3669 			if (error)
   3670 				/* EMPTY */;
   3671 			else if (pkevtp != NULL &&
   3672 			    pnotify.portnfy_port == lio_head_port)
   3673 				error = port_dup_event(pkevtp, &lpkevp,
   3674 				    PORT_ALLOC_DEFAULT);
   3675 			else
   3676 				error = port_alloc_event(pnotify.portnfy_port,
   3677 				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
   3678 				    &lpkevp);
   3679 			if (error == 0) {
   3680 				port_init_event(lpkevp, (uintptr_t)cbp,
   3681 				    (void *)(uintptr_t)pnotify.portnfy_user,
   3682 				    aio_port_callback, reqp);
   3683 				lpkevp->portkev_events = event;
   3684 				reqp->aio_req_portkev = lpkevp;
   3685 				reqp->aio_req_port = pnotify.portnfy_port;
   3686 			}
   3687 		}
   3688 
   3689 		/*
   3690 		 * send the request to driver.
   3691 		 */
   3692 		if (error == 0) {
   3693 			if (aiocb->aio_nbytes == 0) {
   3694 				clear_active_fd(aiocb->aio_fildes);
   3695 				aio_zerolen(reqp);
   3696 				continue;
   3697 			}
   3698 			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
   3699 			    CRED());
   3700 		}
   3701 
   3702 		/*
   3703 		 * the fd's ref count is not decremented until the IO has
   3704 		 * completed unless there was an error.
   3705 		 */
   3706 		if (error) {
   3707 			releasef(aiocb->aio_fildes);
   3708 			lio_set_uerror(&cbp->aio_resultp, error);
   3709 			if (head) {
   3710 				mutex_enter(&aiop->aio_mutex);
   3711 				head->lio_nent--;
   3712 				head->lio_refcnt--;
   3713 				mutex_exit(&aiop->aio_mutex);
   3714 			}
   3715 			if (error == ENOTSUP)
   3716 				aio_notsupported++;
   3717 			else
   3718 				aio_errors++;
   3719 			lio_set_error(reqp, portused);
   3720 		} else {
   3721 			clear_active_fd(aiocb->aio_fildes);
   3722 		}
   3723 	}
   3724 
   3725 	if (aio_notsupported) {
   3726 		error = ENOTSUP;
   3727 	} else if (aio_errors) {
   3728 		/*
   3729 		 * return EIO if any request failed
   3730 		 */
   3731 		error = EIO;
   3732 	}
   3733 
   3734 	if (mode_arg == LIO_WAIT) {
   3735 		mutex_enter(&aiop->aio_mutex);
   3736 		while (head->lio_refcnt > 0) {
   3737 			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
   3738 				mutex_exit(&aiop->aio_mutex);
   3739 				error = EINTR;
   3740 				goto done;
   3741 			}
   3742 		}
   3743 		mutex_exit(&aiop->aio_mutex);
   3744 		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
   3745 	}
   3746 
   3747 done:
   3748 	kmem_free(cbplist, ssize);
   3749 	if (deadhead) {
   3750 		if (head->lio_sigqp)
   3751 			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
   3752 		if (head->lio_portkev)
   3753 			port_free_event(head->lio_portkev);
   3754 		kmem_free(head, sizeof (aio_lio_t));
   3755 	}
   3756 	return (error);
   3757 }
   3758 
   3759 
   3760 #ifdef  _SYSCALL32_IMPL
   3761 void
   3762 aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
   3763 {
   3764 	dest->aio_fildes = src->aio_fildes;
   3765 	dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
   3766 	dest->aio_nbytes = (size_t)src->aio_nbytes;
   3767 	dest->aio_offset = (off_t)src->aio_offset;
   3768 	dest->aio_reqprio = src->aio_reqprio;
   3769 	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
   3770 	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
   3771 
   3772 	/*
   3773 	 * See comment in sigqueue32() on handling of 32-bit
   3774 	 * sigvals in a 64-bit kernel.
   3775 	 */
   3776 	dest->aio_sigevent.sigev_value.sival_int =
   3777 	    (int)src->aio_sigevent.sigev_value.sival_int;
   3778 	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
   3779 	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
   3780 	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
   3781 	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
   3782 	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
   3783 	dest->aio_lio_opcode = src->aio_lio_opcode;
   3784 	dest->aio_state = src->aio_state;
   3785 	dest->aio__pad[0] = src->aio__pad[0];
   3786 }
   3787 #endif /* _SYSCALL32_IMPL */
   3788 
   3789 /*
   3790  * aio_port_callback() is called just before the event is retrieved from the
   3791  * port. The task of this callback function is to finish the work of the
   3792  * transaction for the application, it means :
   3793  * - copyout transaction data to the application
   3794  *	(this thread is running in the right process context)
   3795  * - keep trace of the transaction (update of counters).
   3796  * - free allocated buffers
   3797  * The aiocb pointer is the object element of the port_kevent_t structure.
   3798  *
   3799  * flag :
   3800  *	PORT_CALLBACK_DEFAULT : do copyout and free resources
   3801  *	PORT_CALLBACK_CLOSE   : don't do copyout, free resources
   3802  */
   3803 
   3804 /*ARGSUSED*/
   3805 int
   3806 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
   3807 {
   3808 	aio_t		*aiop = curproc->p_aio;
   3809 	aio_req_t	*reqp = arg;
   3810 	struct	iovec	*iov;
   3811 	struct	buf	*bp;
   3812 	void		*resultp;
   3813 
   3814 	if (pid != curproc->p_pid) {
   3815 		/* wrong proc !!, can not deliver data here ... */
   3816 		return (EACCES);
   3817 	}
   3818 
   3819 	mutex_enter(&aiop->aio_portq_mutex);
   3820 	reqp->aio_req_portkev = NULL;
   3821 	aio_req_remove_portq(aiop, reqp); /* remove request from portq */
   3822 	mutex_exit(&aiop->aio_portq_mutex);
   3823 	aphysio_unlock(reqp);		/* unlock used pages */
   3824 	mutex_enter(&aiop->aio_mutex);
   3825 	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
   3826 		aio_req_free_port(aiop, reqp);	/* back to free list */
   3827 		mutex_exit(&aiop->aio_mutex);
   3828 		return (0);
   3829 	}
   3830 
   3831 	iov = reqp->aio_req_uio.uio_iov;
   3832 	bp = &reqp->aio_req_buf;
   3833 	resultp = (void *)reqp->aio_req_resultp;
   3834 	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
   3835 	mutex_exit(&aiop->aio_mutex);
   3836 	if (flag == PORT_CALLBACK_DEFAULT)
   3837 		aio_copyout_result_port(iov, bp, resultp);
   3838 	return (0);
   3839 }
   3840