Home | History | Annotate | Download | only in sockfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/param.h>
     29 #include <sys/cmn_err.h>
     30 #include <sys/uio.h>
     31 #include <sys/stropts.h>
     32 #include <sys/strsun.h>
     33 #include <sys/systm.h>
     34 #include <sys/socketvar.h>
     35 #include <fs/sockfs/sodirect.h>
     36 
     37 /*
     38  * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
     39  * we use a consolidation private KAPI to allow the protocol to start
     40  * an asynchronous copyout to a user-land receive-side buffer (uioa)
     41  * when a blocking socket read (e.g. read, recv, ...) is pending.
     42  *
 * In broad strokes, this is what happens. When recv is called,
 * we first determine whether it would be beneficial to use uioa, and
 * if so set up the required state (all done by sod_rcv_init()).
 * The protocol can only initiate asynchronous copyout if the receive
 * queue is empty, so the first thing we do is drain any previously
 * queued data (using sod_uioa_so_init()). Once the copyouts (if any)
 * have been scheduled we wait for the receive to be satisfied. During
 * that time any new mblks that are enqueued will be scheduled to be
 * copied out asynchronously (sod_uioa_mblk_init()). When the receive
 * has been satisfied we wait for all scheduled copyout operations to
 * complete before we return to the user (sod_rcv_done()).
     54  */
     55 
/*
 * kmem cache from which every sodirect_t is allocated. It is created by
 * sod_init(), and sod_sock_init()/sod_sock_fini() allocate from and free
 * back to it.
 */
static struct kmem_cache *sock_sod_cache;
     57 
     58 /*
     59  * This function is called at the beginning of recvmsg().
     60  *
     61  * If I/OAT is enabled on this sonode, initialize the uioa state machine
     62  * with state UIOA_ALLOC.
     63  */
     64 uio_t *
     65 sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
     66 {
     67 	struct uio *suiop;
     68 	struct uio *uiop;
     69 	sodirect_t *sodp = so->so_direct;
     70 
     71 	if (sodp == NULL)
     72 		return (NULL);
     73 
     74 	suiop = NULL;
     75 	uiop = *uiopp;
     76 
     77 	mutex_enter(&so->so_lock);
     78 	if (uiop->uio_resid >= uioasync.mincnt &&
     79 	    sodp != NULL && sodp->sod_enabled &&
     80 	    uioasync.enabled && !(flags & MSG_PEEK) &&
     81 	    !so->so_proto_props.sopp_loopback &&
     82 	    !(so->so_state & SS_CANTRCVMORE)) {
     83 		/*
     84 		 * Big enough I/O for uioa min setup and an sodirect socket
     85 		 * and sodirect enabled and uioa enabled and I/O will be done
     86 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
     87 		 */
     88 		if (!uioainit(uiop, &sodp->sod_uioa)) {
     89 			/*
     90 			 * Successful uioainit() so the uio_t part of the
     91 			 * uioa_t will be used for all uio_t work to follow,
     92 			 * we return the original "uiop" in "suiop".
     93 			 */
     94 			suiop = uiop;
     95 			*uiopp = (uio_t *)&sodp->sod_uioa;
     96 			/*
     97 			 * Before returning to the caller the passed in uio_t
     98 			 * "uiop" will be updated via a call to uioafini()
     99 			 * below.
    100 			 *
    101 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
    102 			 * here as first we have to uioamove() any currently
    103 			 * queued M_DATA mblk_t(s) so it will be done later.
    104 			 */
    105 		}
    106 	}
    107 	mutex_exit(&so->so_lock);
    108 
    109 	return (suiop);
    110 }
    111 
    112 /*
    113  * This function is called at the end of recvmsg(), it finializes all the I/OAT
    114  * operations, and reset the uioa state to UIOA_ALLOC.
    115  */
    116 int
    117 sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
    118 {
    119 	int error = 0;
    120 	sodirect_t *sodp = so->so_direct;
    121 	mblk_t *mp;
    122 
    123 	if (sodp == NULL) {
    124 		return (0);
    125 	}
    126 
    127 	ASSERT(MUTEX_HELD(&so->so_lock));
    128 	/* Finish any sodirect and uioa processing */
    129 	if (suiop != NULL) {
    130 		/* Finish any uioa_t processing */
    131 
    132 		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
    133 		error = uioafini(suiop, (uioa_t *)uiop);
    134 		if ((mp = sodp->sod_uioafh) != NULL) {
    135 			sodp->sod_uioafh = NULL;
    136 			sodp->sod_uioaft = NULL;
    137 			freemsg(mp);
    138 		}
    139 	}
    140 	ASSERT(sodp->sod_uioafh == NULL);
    141 
    142 	return (error);
    143 }
    144 
    145 /*
    146  * Schedule a uioamove() on a mblk. This is done as mblks are enqueued
    147  * by the protocol on the socket's rcv queue.
    148  *
    149  * Caller must be holding so_lock.
    150  */
    151 void
    152 sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
    153 {
    154 	uioa_t *uioap = &sodp->sod_uioa;
    155 	mblk_t *mp1 = mp;
    156 	mblk_t *lmp = NULL;
    157 
    158 	ASSERT(DB_TYPE(mp) == M_DATA);
    159 	ASSERT(msg_size == msgdsize(mp));
    160 
    161 	if (uioap->uioa_state & UIOA_ENABLED) {
    162 		/* Uioa is enabled */
    163 
    164 		if (msg_size > uioap->uio_resid) {
    165 			/*
    166 			 * There isn't enough uio space for the mblk_t chain
    167 			 * so disable uioa such that this and any additional
    168 			 * mblk_t data is handled by the socket and schedule
    169 			 * the socket for wakeup to finish this uioa.
    170 			 */
    171 			uioap->uioa_state &= UIOA_CLR;
    172 			uioap->uioa_state |= UIOA_FINI;
    173 			return;
    174 		}
    175 		do {
    176 			uint32_t	len = MBLKL(mp1);
    177 
    178 			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
    179 				/* Scheduled, mark dblk_t as such */
    180 				DB_FLAGS(mp1) |= DBLK_UIOA;
    181 			} else {
    182 				/* Error, turn off async processing */
    183 				uioap->uioa_state &= UIOA_CLR;
    184 				uioap->uioa_state |= UIOA_FINI;
    185 				break;
    186 			}
    187 			lmp = mp1;
    188 		} while ((mp1 = mp1->b_cont) != NULL);
    189 
    190 		if (mp1 != NULL || uioap->uio_resid == 0) {
    191 			/* Break the mblk chain if neccessary. */
    192 			if (mp1 != NULL && lmp != NULL) {
    193 				mp->b_next = mp1;
    194 				lmp->b_cont = NULL;
    195 			}
    196 		}
    197 	}
    198 }
    199 
    200 /*
    201  * This function is called on a mblk that thas been successfully uioamoved().
    202  */
    203 void
    204 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
    205 {
    206 	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
    207 		/*
    208 		 * A uioa flaged mblk_t chain, already uio processed,
    209 		 * add it to the sodirect uioa pending free list.
    210 		 *
    211 		 * Note, a b_cont chain headed by a DBLK_UIOA enable
    212 		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
    213 		 */
    214 		mblk_t	*bpt = sodp->sod_uioaft;
    215 
    216 		ASSERT(sodp != NULL);
    217 
    218 		/*
    219 		 * Add first mblk_t of "bp" chain to current sodirect uioa
    220 		 * free list tail mblk_t, if any, else empty list so new head.
    221 		 */
    222 		if (bpt == NULL)
    223 			sodp->sod_uioafh = bp;
    224 		else
    225 			bpt->b_cont = bp;
    226 
    227 		/*
    228 		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
    229 		 * each to reflect that uioamove() has consumed all data.
    230 		 */
    231 		bpt = bp;
    232 		for (;;) {
    233 			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
    234 
    235 			bpt->b_rptr = bpt->b_wptr;
    236 			if (bpt->b_cont == NULL)
    237 				break;
    238 			bpt = bpt->b_cont;
    239 		}
    240 		/* New sodirect uioa free list tail */
    241 		sodp->sod_uioaft = bpt;
    242 
    243 		/* Only dequeue once with data returned per uioa_t */
    244 		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
    245 			sodp->sod_uioa.uioa_state &= UIOA_CLR;
    246 			sodp->sod_uioa.uioa_state |= UIOA_FINI;
    247 		}
    248 	}
    249 }
    250 
    251 /*
    252  * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call
    253  * this function on a non-STREAMS socket to schedule uioamove() on the data
    254  * that has already queued in this socket.
    255  */
    256 void
    257 sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
    258 {
    259 	uioa_t	*uioap = (uioa_t *)uiop;
    260 	mblk_t	*lbp;
    261 	mblk_t	*wbp;
    262 	mblk_t	*bp;
    263 	int	len;
    264 	int	error;
    265 	boolean_t in_rcv_q = B_TRUE;
    266 
    267 	ASSERT(MUTEX_HELD(&so->so_lock));
    268 	ASSERT(&sodp->sod_uioa == uioap);
    269 
    270 	/*
    271 	 * Walk first b_cont chain in sod_q
    272 	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
    273 	 */
    274 	bp = so->so_rcv_q_head;
    275 
    276 again:
    277 	/* Walk the chain */
    278 	lbp = NULL;
    279 	wbp = bp;
    280 
    281 	do {
    282 		if (bp == NULL)
    283 			break;
    284 
    285 		if (wbp->b_datap->db_type != M_DATA) {
    286 			/* Not M_DATA, no more uioa */
    287 			goto nouioa;
    288 		}
    289 		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
    290 			/* Have a M_DATA mblk_t with data */
    291 			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
    292 			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
    293 				/* Not enough uio sapce, or beyond oobmark */
    294 				goto nouioa;
    295 			}
    296 			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
    297 			error = uioamove(wbp->b_rptr, len,
    298 			    UIO_READ, uioap);
    299 			if (!error) {
    300 				/* Scheduled, mark dblk_t as such */
    301 				wbp->b_datap->db_flags |= DBLK_UIOA;
    302 			} else {
    303 				/* Break the mblk chain */
    304 				goto nouioa;
    305 			}
    306 		}
    307 		/* Save last wbp processed */
    308 		lbp = wbp;
    309 	} while ((wbp = wbp->b_cont) != NULL);
    310 
    311 	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
    312 		/*
    313 		 * We get here only once to process the sonode dump area
    314 		 * if so_rcv_q_head is NULL or all the mblks have been
    315 		 * successfully uioamoved()ed.
    316 		 */
    317 		in_rcv_q = B_FALSE;
    318 
    319 		/* move to dump area */
    320 		bp = so->so_rcv_head;
    321 		goto again;
    322 	}
    323 
    324 	return;
    325 
    326 nouioa:
    327 	/* No more uioa */
    328 	uioap->uioa_state &= UIOA_CLR;
    329 	uioap->uioa_state |= UIOA_FINI;
    330 
    331 	/*
    332 	 * If we processed 1 or more mblk_t(s) then we need to split the
    333 	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
    334 	 * are in the current chain and the rest are in the following new
    335 	 * chain.
    336 	 */
    337 	if (lbp != NULL) {
    338 		/* New end of current chain */
    339 		lbp->b_cont = NULL;
    340 
    341 		/* Insert new chain wbp after bp */
    342 		if ((wbp->b_next = bp->b_next) == NULL) {
    343 			if (in_rcv_q)
    344 				so->so_rcv_q_last_head = wbp;
    345 			else
    346 				so->so_rcv_last_head = wbp;
    347 		}
    348 		bp->b_next = wbp;
    349 		bp->b_next->b_prev = bp->b_prev;
    350 		bp->b_prev = lbp;
    351 	}
    352 }
    353 
    354 /*
    355  * Initialize sodirect data structures on a socket.
    356  */
    357 void
    358 sod_sock_init(struct sonode *so)
    359 {
    360 	sodirect_t	*sodp;
    361 
    362 	ASSERT(so->so_direct == NULL);
    363 
    364 	so->so_state |= SS_SODIRECT;
    365 
    366 	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
    367 	sodp->sod_enabled = B_TRUE;
    368 	sodp->sod_uioafh = NULL;
    369 	sodp->sod_uioaft = NULL;
    370 	/*
    371 	 * Remainder of the sod_uioa members are left uninitialized
    372 	 * but will be initialized later by uioainit() before uioa
    373 	 * is enabled.
    374 	 */
    375 	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
    376 	so->so_direct = sodp;
    377 }
    378 
    379 void
    380 sod_sock_fini(struct sonode *so)
    381 {
    382 	sodirect_t *sodp = so->so_direct;
    383 
    384 	ASSERT(sodp->sod_uioafh == NULL);
    385 
    386 	so->so_direct = NULL;
    387 	kmem_cache_free(sock_sod_cache, sodp);
    388 }
    389 
    390 /*
    391  * Init the sodirect kmem cache while sockfs is loading.
    392  */
    393 int
    394 sod_init()
    395 {
    396 	/* Allocate sodirect_t kmem_cache */
    397 	sock_sod_cache = kmem_cache_create("sock_sod_cache",
    398 	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    399 
    400 	return (0);
    401 }
    402 
    403 ssize_t
    404 sod_uioa_mblk(struct sonode *so, mblk_t *mp)
    405 {
    406 	sodirect_t *sodp = so->so_direct;
    407 
    408 	ASSERT(sodp != NULL);
    409 	ASSERT(MUTEX_HELD(&so->so_lock));
    410 
    411 	ASSERT(sodp->sod_enabled);
    412 	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
    413 
    414 	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
    415 
    416 	if (mp == NULL && so->so_rcv_q_head != NULL) {
    417 		mp = so->so_rcv_q_head;
    418 		ASSERT(mp->b_prev != NULL);
    419 		mp->b_prev = NULL;
    420 		so->so_rcv_q_head = mp->b_next;
    421 		if (so->so_rcv_q_head == NULL) {
    422 			so->so_rcv_q_last_head = NULL;
    423 		}
    424 		mp->b_next = NULL;
    425 	}
    426 
    427 	sod_uioa_mblk_done(sodp, mp);
    428 
    429 	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
    430 	    DB_TYPE(so->so_rcv_head) == M_DATA &&
    431 	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
    432 		/* more arrived */
    433 		ASSERT(so->so_rcv_q_head == NULL);
    434 		mp = so->so_rcv_head;
    435 		so->so_rcv_head = mp->b_next;
    436 		if (so->so_rcv_head == NULL)
    437 			so->so_rcv_last_head = NULL;
    438 		mp->b_prev = mp->b_next = NULL;
    439 		sod_uioa_mblk_done(sodp, mp);
    440 	}
    441 
    442 #ifdef DEBUG
    443 	if (so->so_rcv_q_head != NULL) {
    444 		mblk_t *m = so->so_rcv_q_head;
    445 		while (m != NULL) {
    446 			if (DB_FLAGS(m) & DBLK_UIOA) {
    447 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
    448 				    " in so_rcv_q_head.\n", (void *)m);
    449 			}
    450 			m = m->b_next;
    451 		}
    452 	}
    453 	if (so->so_rcv_head != NULL) {
    454 		mblk_t *m = so->so_rcv_head;
    455 		while (m != NULL) {
    456 			if (DB_FLAGS(m) & DBLK_UIOA) {
    457 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
    458 				    " in so_rcv_head.\n", (void *)m);
    459 			}
    460 			m = m->b_next;
    461 		}
    462 	}
    463 #endif
    464 	return (sodp->sod_uioa.uioa_mbytes);
    465 }
    466