Home | History | Annotate | Download | only in inet
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 /*
     30  * This file contains common code for handling Options Management requests.
     31  */
     32 
     33 #include <sys/types.h>
     34 #include <sys/stream.h>
     35 #include <sys/stropts.h>
     36 #include <sys/strsubr.h>
     37 #include <sys/errno.h>
     38 #define	_SUN_TPI_VERSION 2
     39 #include <sys/tihdr.h>
     40 #include <sys/socket.h>
     41 #include <sys/ddi.h>
     42 #include <sys/debug.h>		/* for ASSERT */
     43 #include <sys/policy.h>
     44 
     45 #include <inet/common.h>
     46 #include <inet/mi.h>
     47 #include <inet/nd.h>
     48 #include <netinet/ip6.h>
     49 #include <inet/ip.h>
     50 #include <inet/mib2.h>
     51 #include <netinet/in.h>
     52 #include "optcom.h"
     53 
     54 #include <inet/optcom.h>
     55 
     56 /*
     57  * Function prototypes
     58  */
     59 static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *,
     60     boolean_t *, size_t *);
     61 static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp,
     62     mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp,
     63     mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp);
     64 static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t);
     65 static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **,
     66     t_uscalar_t *, cred_t *, optdb_obj_t *);
     67 static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **,
     68     t_uscalar_t *, cred_t *cr, optdb_obj_t *);
     69 static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
     70     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
     71     cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp);
     72 static opdes_t *opt_chk_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t);
     73 static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t);
     74 static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t);
     75 static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
     76 static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
     77 static boolean_t opt_bloated_maxsize(opdes_t *);
     78 
     79 /* Common code for sending back a T_ERROR_ACK. */
     80 void
     81 optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
     82 {
     83 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
     84 		qreply(q, mp);
     85 }
     86 
     87 /*
     88  * The option management routines svr4_optcom_req() and tpi_optcom_req() use
     89  * callback functions as arguments. Here are the expected interfaces
     90  * assumed from the callback functions.
     91  *
     92  *
     93  * (1) deffn(q, optlevel, optname, optvalp)
     94  *
     95  *	- Function only called when default value comes from protocol
     96  *	 specific code and not the option database table (indicated by
     97  *	  OP_DEF_FN property in option database.)
     98  *	- Error return is -1. Valid returns are >=0.
     99  *	- When valid, the return value represents the length used for storing
    100  *		the default value of the option.
    101  *      - Error return implies the called routine did not recognize this
    102  *              option. Something downstream could so input is left unchanged
    103  *              in request buffer.
    104  *
    105  * (2) getfn(q, optlevel, optname, optvalp)
    106  *
    107  *	- Error return is -1. Valid returns are >=0.
    108  *	- When valid, the return value represents the length used for storing
    109  *		the actual value of the option.
    110  *      - Error return implies the called routine did not recognize this
    111  *              option. Something downstream could so input is left unchanged
    112  *              in request buffer.
    113  *
    114  * (3) setfn(q, optset_context, optlevel, optname, inlen, invalp,
    115  *	outlenp, outvalp, attrp, cr);
    116  *
    117  *	- OK return is 0, Error code is returned as a non-zero argument.
    118  *      - If negative it is ignored by svr4_optcom_req(). If positive, error
    119  *        is returned. A negative return implies that option, while handled on
    120  *	  this stack is not handled at this level and will be handled further
    121  *	  downstream.
    122  *	- Both negative and positive errors are treated as errors in an
    123  *	  identical manner by tpi_optcom_req(). The errors affect "status"
    124  *	  field of each option's T_opthdr. If successful, an appropriate success
    125  *	  result is carried. If error, it is instantiated to "failure" at the
    126  *	  topmost level and left unchanged at other levels. (This "failure" can
    127  *	  turn to a success at another level).
    128  *	- optset_context passed for tpi_optcom_req(). It is interpreted as:
    129  *        - SETFN_OPTCOM_CHECKONLY
    130  *		semantics are to pretend to set the value and report
    131  *		back if it would be successful.
    132  *		This is used with T_CHECK semantics in XTI
    133  *        - SETFN_OPTCOM_NEGOTIATE
    134  *		set the value. Call from option management primitive
    135  *		T_OPTMGMT_REQ when T_NEGOTIATE flags is used.
    136  *	  - SETFN_UD_NEGOTIATE
    137  *		option request came riding on UNITDATA primitive most often
    138  *		has "this datagram" semantics to influence properties
    139  *		affecting an outgoing datagram or associated with received
    140  *		datagram
    141  *		[ Note: XTI permits this use outside of "this datagram"
    142  *		semantics also and permits setting "management related"
    143  *		options in this	context and its test suite enforces it ]
    144  *	  - SETFN_CONN_NEGOTIATE
    145  *		option request came riding on CONN_REQ/RES primitive and
    146  *		most often has "this connection" (negotiation during
    147  *		"connection establishment") semantics.
    148  *		[ Note: XTI permits use of these outside of "this connection"
    149  *		semantics and permits "management related" options in this
    150  *		context and its test suite enforces it. ]
    151  *
    152  *	- inlen, invalp is the option length,value requested to be set.
    153  *	- outlenp, outvalp represent return parameters which contain the
    154  *	  value set and it might be different from one passed on input.
    155  *	- attrp points to a data structure that's used by v6 modules to
    156  *	  store ancillary data options or sticky options.
    157  *	- cr points to the caller's credentials
    158  *	- the caller might pass same buffers for input and output and the
    159  *	  routine should protect against this case by not updating output
    160  *	  buffers until it is done referencing input buffers and any other
    161  *	  issues (e.g. not use bcopy() if we do not trust what it does).
    162  *      - If option is not known, it returns error. We randomly pick EINVAL.
    163  *        It can however get called with options that are handled downstream
    164  *        or upstream so for svr4_optcom_req(), it does not return error for
    165  *        negative return values.
    166  *
    167  */
    168 
    169 /*
    170  * Upper Level Protocols call this routine when they receive
    171  * a T_SVR4_OPTMGMT_REQ message.  They supply callback functions
    172  * for setting a new value for a single options, getting the
    173  * current value for a single option, and checking for support
    174  * of a single option.  svr4_optcom_req validates the option management
    175  * buffer passed in, and calls the appropriate routines to do the
    176  * job requested.
    177  * XXX Code below needs some restructuring after we have some more
    178  * macros to support 'struct opthdr' in the headers.
    179  *
    180  * IP-MT notes: The option management framework functions svr4_optcom_req() and
    181  * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual
    182  * T_optmgmt_req mblk and pass the chain as an additional parameter to the
    183  * protocol set functions. If a protocol set function (such as ip_opt_set)
    184  * cannot process the option immediately it can return EINPROGRESS. ip_opt_set
    185  * enqueues the message in the appropriate sq and returns EINPROGRESS. Later
    186  * the sq framework arranges to restart this operation and passes control to
    187  * the restart function ip_restart_optmgmt() which in turn calls
    188  * svr4_optcom_req() or tpi_optcom_req() to restart the option processing.
    189  */
    190 int
    191 svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
    192     boolean_t pass_to_ip)
    193 {
    194 	pfi_t	deffn = dbobjp->odb_deffn;
    195 	pfi_t	getfn = dbobjp->odb_getfn;
    196 	opt_set_fn setfn = dbobjp->odb_setfn;
    197 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
    198 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
    199 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
    200 	opt_restart_t *or;
    201 	struct opthdr *restart_opt;
    202 	boolean_t is_restart = B_FALSE;
    203 	mblk_t	*first_mp;
    204 
    205 	t_uscalar_t max_optbuf_len;
    206 	int len;
    207 	mblk_t	*mp1 = NULL;
    208 	struct opthdr *next_opt;
    209 	struct opthdr *opt;
    210 	struct opthdr *opt1;
    211 	struct opthdr *opt_end;
    212 	struct opthdr *opt_start;
    213 	opdes_t	*optd;
    214 	boolean_t	pass_to_next = B_FALSE;
    215 	struct T_optmgmt_ack *toa;
    216 	struct T_optmgmt_req *tor;
    217 
    218 	/*
    219 	 * Allocate M_CTL and prepend to the packet for restarting this
    220 	 * option if needed. IP may need to queue and restart the option
    221 	 * if it cannot obtain exclusive conditions immediately. Please see
    222 	 * IP-MT notes before the start of svr4_optcom_req
    223 	 */
    224 	if (mp->b_datap->db_type == M_CTL) {
    225 		is_restart = B_TRUE;
    226 		first_mp = mp;
    227 		mp = mp->b_cont;
    228 		ASSERT(mp->b_wptr - mp->b_rptr >=
    229 		    sizeof (struct T_optmgmt_req));
    230 		tor = (struct T_optmgmt_req *)mp->b_rptr;
    231 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
    232 
    233 		or = (opt_restart_t *)first_mp->b_rptr;
    234 		opt_start = or->or_start;
    235 		opt_end = or->or_end;
    236 		restart_opt = or->or_ropt;
    237 		goto restart;
    238 	}
    239 
    240 	tor = (struct T_optmgmt_req *)mp->b_rptr;
    241 	/* Verify message integrity. */
    242 	if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req))
    243 		goto bad_opt;
    244 	/* Verify MGMT_flags legal */
    245 	switch (tor->MGMT_flags) {
    246 	case T_DEFAULT:
    247 	case T_NEGOTIATE:
    248 	case T_CURRENT:
    249 	case T_CHECK:
    250 		/* OK - legal request flags */
    251 		break;
    252 	default:
    253 		optcom_err_ack(q, mp, TBADFLAG, 0);
    254 		return (0);
    255 	}
    256 	if (tor->MGMT_flags == T_DEFAULT) {
    257 		/* Is it a request for default option settings? */
    258 
    259 		/*
    260 		 * Note: XXX TLI and TPI specification was unclear about
    261 		 * semantics of T_DEFAULT and the following historical note
    262 		 * and its interpretation is incorrect (it implies a request
    263 		 * for default values of only the identified options not all.
    264 		 * The semantics have been explained better in XTI spec.)
    265 		 * However, we do not modify (comment or code) here to keep
    266 		 * compatibility.
    267 		 * We can rethink this if it ever becomes an issue.
    268 		 * ----historical comment start------
    269 		 * As we understand it, the input buffer is meaningless
    270 		 * so we ditch the message.  A T_DEFAULT request is a
    271 		 * request to obtain a buffer containing defaults for
    272 		 * all supported options, so we allocate a maximum length
    273 		 * reply.
    274 		 * ----historical comment end -------
    275 		 */
    276 		/* T_DEFAULT not passed down */
    277 		ASSERT(topmost_tpiprovider == B_TRUE);
    278 		freemsg(mp);
    279 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
    280 		    opt_arr_cnt);
    281 		mp = allocb(max_optbuf_len, BPRI_MED);
    282 		if (!mp) {
    283 no_mem:;
    284 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
    285 			return (0);
    286 		}
    287 
    288 		/* Initialize the T_optmgmt_ack header. */
    289 		toa = (struct T_optmgmt_ack *)mp->b_rptr;
    290 		bzero((char *)toa, max_optbuf_len);
    291 		toa->PRIM_type = T_OPTMGMT_ACK;
    292 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
    293 		/* TODO: Is T_DEFAULT the right thing to put in MGMT_flags? */
    294 		toa->MGMT_flags = T_DEFAULT;
    295 
    296 		/* Now walk the table of options passed in */
    297 		opt = (struct opthdr *)&toa[1];
    298 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
    299 			/*
    300 			 * All the options in the table of options passed
    301 			 * in are by definition supported by the protocol
    302 			 * calling this function.
    303 			 */
    304 			if (!OA_READ_PERMISSION(optd, cr))
    305 				continue;
    306 			opt->level = optd->opdes_level;
    307 			opt->name = optd->opdes_name;
    308 			if (!(optd->opdes_props & OP_DEF_FN) ||
    309 			    ((len = (*deffn)(q, opt->level,
    310 			    opt->name, (uchar_t *)&opt[1])) < 0)) {
    311 				/*
    312 				 * Fill length and value from table.
    313 				 *
    314 				 * Default value not instantiated from function
    315 				 * (or the protocol specific function failed it;
    316 				 * In this interpretation of T_DEFAULT, this is
    317 				 * the best we can do)
    318 				 */
    319 				switch (optd->opdes_size) {
    320 				/*
    321 				 * Since options are guaranteed aligned only
    322 				 * on a 4 byte boundary (t_scalar_t) any
    323 				 * option that is greater in size will default
    324 				 * to the bcopy below
    325 				 */
    326 				case sizeof (int32_t):
    327 					*(int32_t *)&opt[1] =
    328 					    (int32_t)optd->opdes_default;
    329 					break;
    330 				case sizeof (int16_t):
    331 					*(int16_t *)&opt[1] =
    332 					    (int16_t)optd->opdes_default;
    333 					break;
    334 				case sizeof (int8_t):
    335 					*(int8_t *)&opt[1] =
    336 					    (int8_t)optd->opdes_default;
    337 					break;
    338 				default:
    339 					/*
    340 					 * other length but still assume
    341 					 * fixed - use bcopy
    342 					 */
    343 					bcopy(optd->opdes_defbuf,
    344 					    &opt[1], optd->opdes_size);
    345 					break;
    346 				}
    347 				opt->len = optd->opdes_size;
    348 			}
    349 			else
    350 				opt->len = (t_uscalar_t)len;
    351 			opt = (struct opthdr *)((char *)&opt[1] +
    352 			    _TPI_ALIGN_OPT(opt->len));
    353 		}
    354 
    355 		/* Now record the final length. */
    356 		toa->OPT_length = (t_scalar_t)((char *)opt - (char *)&toa[1]);
    357 		mp->b_wptr = (uchar_t *)opt;
    358 		mp->b_datap->db_type = M_PCPROTO;
    359 		/* Ship it back. */
    360 		qreply(q, mp);
    361 		return (0);
    362 	}
    363 	/* T_DEFAULT processing complete - no more T_DEFAULT */
    364 
    365 	/*
    366 	 * For T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make a
    367 	 * pass through the input buffer validating the details and
    368 	 * making sure each option is supported by the protocol.
    369 	 */
    370 	if ((opt_start = (struct opthdr *)mi_offset_param(mp,
    371 	    tor->OPT_offset, tor->OPT_length)) == NULL)
    372 		goto bad_opt;
    373 	if (!__TPI_OPT_ISALIGNED(opt_start))
    374 		goto bad_opt;
    375 
    376 	opt_end = (struct opthdr *)((uchar_t *)opt_start +
    377 	    tor->OPT_length);
    378 
    379 	for (opt = opt_start; opt < opt_end; opt = next_opt) {
    380 		/*
    381 		 * Verify we have room to reference the option header
    382 		 * fields in the option buffer.
    383 		 */
    384 		if ((uchar_t *)opt + sizeof (struct opthdr) >
    385 		    (uchar_t *)opt_end)
    386 			goto bad_opt;
    387 		/*
    388 		 * We now compute pointer to next option in buffer 'next_opt'
    389 		 * The next_opt computation above below 'opt->len' initialized
    390 		 * by application which cannot be trusted. The usual value
    391 		 * too large will be captured by the loop termination condition
    392 		 * above. We check for the following which it will miss.
    393 		 * 	-pointer space wraparound arithmetic overflow
    394 		 *	-last option in buffer with 'opt->len' being too large
    395 		 *	 (only reason 'next_opt' should equal or exceed
    396 		 *	 'opt_end' for last option is roundup unless length is
    397 		 *	 too-large/invalid)
    398 		 */
    399 		next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
    400 		    _TPI_ALIGN_OPT(opt->len));
    401 
    402 		if ((uchar_t *)next_opt < (uchar_t *)&opt[1] ||
    403 		    ((next_opt >= opt_end) &&
    404 		    (((uchar_t *)next_opt - (uchar_t *)opt_end) >=
    405 		    __TPI_ALIGN_SIZE)))
    406 			goto bad_opt;
    407 
    408 		/* sanity check */
    409 		if (opt->name == T_ALLOPT)
    410 			goto bad_opt;
    411 
    412 		/* Find the option in the opt_arr. */
    413 		if ((optd = opt_chk_lookup(opt->level, opt->name,
    414 		    opt_arr, opt_arr_cnt)) == NULL) {
    415 			/*
    416 			 * Not found, that is a bad thing if
    417 			 * the caller is a tpi provider
    418 			 */
    419 			if (topmost_tpiprovider)
    420 				goto bad_opt;
    421 			else
    422 				continue; /* skip unmodified */
    423 		}
    424 
    425 		/* Additional checks dependent on operation. */
    426 		switch (tor->MGMT_flags) {
    427 		case T_NEGOTIATE:
    428 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
    429 				/* can't negotiate option */
    430 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
    431 				    OA_WX_ANYPRIV(optd)) {
    432 					/*
    433 					 * not privileged but privilege
    434 					 * will help negotiate option.
    435 					 */
    436 					optcom_err_ack(q, mp, TACCES, 0);
    437 					return (0);
    438 				} else
    439 					goto bad_opt;
    440 			}
    441 			/*
    442 			 * Verify size for options
    443 			 * Note: For retaining compatibility with historical
    444 			 * behavior, variable lengths options will have their
    445 			 * length verified in the setfn() processing.
    446 			 * In order to be compatible with SunOS 4.X we return
    447 			 * EINVAL errors for bad lengths.
    448 			 */
    449 			if (!(optd->opdes_props & OP_VARLEN)) {
    450 				/* fixed length - size must match */
    451 				if (opt->len != optd->opdes_size) {
    452 					optcom_err_ack(q, mp, TSYSERR, EINVAL);
    453 					return (0);
    454 				}
    455 			}
    456 			break;
    457 
    458 		case T_CHECK:
    459 			if (!OA_RWX_ANYPRIV(optd))
    460 				/* any of "rwx" permission but not not none */
    461 				goto bad_opt;
    462 			/*
    463 			 * XXX Since T_CURRENT was not there in TLI and the
    464 			 * official TLI inspired TPI standard, getsockopt()
    465 			 * API uses T_CHECK (for T_CURRENT semantics)
    466 			 * The following fallthru makes sense because of its
    467 			 * historical use as semantic equivalent to T_CURRENT.
    468 			 */
    469 			/* FALLTHRU */
    470 		case T_CURRENT:
    471 			if (!OA_READ_PERMISSION(optd, cr)) {
    472 				/* can't read option value */
    473 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
    474 				    OA_R_ANYPRIV(optd)) {
    475 					/*
    476 					 * not privileged but privilege
    477 					 * will help in reading option value.
    478 					 */
    479 					optcom_err_ack(q, mp, TACCES, 0);
    480 					return (0);
    481 				} else
    482 					goto bad_opt;
    483 			}
    484 			break;
    485 
    486 		default:
    487 			optcom_err_ack(q, mp, TBADFLAG, 0);
    488 			return (0);
    489 		}
    490 		/* We liked it.  Keep going. */
    491 	} /* end for loop scanning option buffer */
    492 
    493 	/* Now complete the operation as required. */
    494 	switch (tor->MGMT_flags) {
    495 	case T_CHECK:
    496 		/*
    497 		 * Historically used same as T_CURRENT (which was added to
    498 		 * standard later). Code retained for compatibility.
    499 		 */
    500 		/* FALLTHROUGH */
    501 	case T_CURRENT:
    502 		/*
    503 		 * Allocate a maximum size reply.  Perhaps we are supposed to
    504 		 * assume that the input buffer includes space for the answers
    505 		 * as well as the opthdrs, but we don't know that for sure.
    506 		 * So, instead, we create a new output buffer, using the
    507 		 * input buffer only as a list of options.
    508 		 */
    509 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
    510 		    opt_arr_cnt);
    511 		mp1 = allocb_cred(max_optbuf_len, cr);
    512 		if (!mp1)
    513 			goto no_mem;
    514 		/* Initialize the header. */
    515 		mp1->b_datap->db_type = M_PCPROTO;
    516 		mp1->b_wptr = &mp1->b_rptr[sizeof (struct T_optmgmt_ack)];
    517 		toa = (struct T_optmgmt_ack *)mp1->b_rptr;
    518 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
    519 		toa->MGMT_flags = tor->MGMT_flags;
    520 		/*
    521 		 * Walk through the input buffer again, this time adding
    522 		 * entries to the output buffer for each option requested.
    523 		 * Note, sanity of option header, last option etc, verified
    524 		 * in first pass.
    525 		 */
    526 		opt1 = (struct opthdr *)&toa[1];
    527 
    528 		for (opt = opt_start; opt < opt_end; opt = next_opt) {
    529 
    530 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
    531 			    _TPI_ALIGN_OPT(opt->len));
    532 
    533 			opt1->name = opt->name;
    534 			opt1->level = opt->level;
    535 			len = (*getfn)(q, opt->level,
    536 			    opt->name, (uchar_t *)&opt1[1]);
    537 			/*
    538 			 * Failure means option is not recognized. Copy input
    539 			 * buffer as is
    540 			 */
    541 			if (len < 0) {
    542 				opt1->len = opt->len;
    543 				bcopy(&opt[1], &opt1[1], opt->len);
    544 			} else {
    545 				opt1->len = (t_uscalar_t)len;
    546 			}
    547 			opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
    548 			    _TPI_ALIGN_OPT(opt1->len));
    549 		} /* end for loop */
    550 
    551 		/* Record the final length. */
    552 		toa->OPT_length = (t_scalar_t)((uchar_t *)opt1 -
    553 		    (uchar_t *)&toa[1]);
    554 		mp1->b_wptr = (uchar_t *)opt1;
    555 		/* Ditch the input buffer. */
    556 		freemsg(mp);
    557 		mp = mp1;
    558 		/* Always let the next module look at the option. */
    559 		pass_to_next = B_TRUE;
    560 		break;
    561 
    562 	case T_NEGOTIATE:
    563 		first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
    564 		if (first_mp == NULL) {
    565 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
    566 			return (0);
    567 		}
    568 		first_mp->b_datap->db_type = M_CTL;
    569 		or = (opt_restart_t *)first_mp->b_rptr;
    570 		or->or_start = opt_start;
    571 		or->or_end =  opt_end;
    572 		or->or_type = T_SVR4_OPTMGMT_REQ;
    573 		or->or_private = 0;
    574 		first_mp->b_cont = mp;
    575 restart:
    576 		/*
    577 		 * Here we are expecting that the response buffer is exactly
    578 		 * the same size as the input buffer.  We pass each opthdr
    579 		 * to the protocol's set function.  If the protocol doesn't
    580 		 * like it, it can update the value in it return argument.
    581 		 */
    582 		/*
    583 		 * Pass each negotiated option through the protocol set
    584 		 * function.
    585 		 * Note: sanity check on option header values done in first
    586 		 * pass and not repeated here.
    587 		 */
    588 		toa = (struct T_optmgmt_ack *)tor;
    589 
    590 		for (opt = is_restart ? restart_opt: opt_start; opt < opt_end;
    591 		    opt = next_opt) {
    592 			int error;
    593 
    594 			/*
    595 			 * Point to the current option in or, in case this
    596 			 * option has to be restarted later on
    597 			 */
    598 			or->or_ropt = opt;
    599 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
    600 			    _TPI_ALIGN_OPT(opt->len));
    601 
    602 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
    603 			    opt->level, opt->name,
    604 			    opt->len, (uchar_t *)&opt[1],
    605 			    &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp);
    606 			/*
    607 			 * Treat positive "errors" as real.
    608 			 * Note: negative errors are to be treated as
    609 			 * non-fatal by svr4_optcom_req() and are
    610 			 * returned by setfn() when it is passed an
    611 			 * option it does not handle. Since the option
    612 			 * passed opt_chk_lookup(), it is implied that
    613 			 * it is valid but was either handled upstream
    614 			 * or will be handled downstream.
    615 			 */
    616 			if (error == EINPROGRESS) {
    617 				/*
    618 				 * The message is queued and will be
    619 				 * reprocessed later. Typically ip queued
    620 				 * the message to get some exclusive conditions
    621 				 * and later on calls this func again.
    622 				 */
    623 				return (EINPROGRESS);
    624 			} else if (error > 0) {
    625 				optcom_err_ack(q, mp, TSYSERR, error);
    626 				freeb(first_mp);
    627 				return (0);
    628 			}
    629 			/*
    630 			 * error < 0 means option is not recognized.
    631 			 * But with OP_PASSNEXT the next module
    632 			 * might recognize it.
    633 			 */
    634 		}
    635 		/* Done with the restart control mp. */
    636 		freeb(first_mp);
    637 		pass_to_next = B_TRUE;
    638 		break;
    639 	default:
    640 		optcom_err_ack(q, mp, TBADFLAG, 0);
    641 		return (0);
    642 	}
    643 
    644 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
    645 		/* Send it down to the next module and let it reply */
    646 		toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */
    647 		if (q->q_next != NULL)
    648 			putnext(q, mp);
    649 		else
    650 			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
    651 	} else {
    652 		/* Set common fields in the header. */
    653 		toa->MGMT_flags = T_SUCCESS;
    654 		mp->b_datap->db_type = M_PCPROTO;
    655 		toa->PRIM_type = T_OPTMGMT_ACK;
    656 		qreply(q, mp);
    657 	}
    658 	return (0);
    659 bad_opt:;
    660 	optcom_err_ack(q, mp, TBADOPT, 0);
    661 	return (0);
    662 }
    663 
    664 /*
    665  * New optcom_req inspired by TPI/XTI semantics
    666  */
    667 int
    668 tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
    669     boolean_t pass_to_ip)
    670 {
    671 	t_scalar_t t_error;
    672 	mblk_t *toa_mp;
    673 	boolean_t pass_to_next;
    674 	size_t toa_len;
    675 	struct T_optmgmt_ack *toa;
    676 	struct T_optmgmt_req *tor =
    677 	    (struct T_optmgmt_req *)mp->b_rptr;
    678 
    679 	opt_restart_t *or;
    680 	boolean_t is_restart = B_FALSE;
    681 	mblk_t	*first_mp = NULL;
    682 	t_uscalar_t worst_status;
    683 	boolean_t queued_status;
    684 
    685 	/*
    686 	 * Allocate M_CTL and prepend to the packet for restarting this
    687 	 * option if needed. IP may need to queue and restart the option
    688 	 * if it cannot obtain exclusive conditions immediately. Please see
    689 	 * IP-MT notes before the start of svr4_optcom_req
    690 	 */
    691 	if (mp->b_datap->db_type == M_CTL) {
    692 		is_restart = B_TRUE;
    693 		first_mp = mp;
    694 		toa_mp = mp->b_cont;
    695 		mp = toa_mp->b_cont;
    696 		ASSERT(mp->b_wptr - mp->b_rptr >=
    697 		    sizeof (struct T_optmgmt_req));
    698 		tor = (struct T_optmgmt_req *)mp->b_rptr;
    699 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
    700 
    701 		or = (opt_restart_t *)first_mp->b_rptr;
    702 		goto restart;
    703 	}
    704 
    705 	/* Verify message integrity. */
    706 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) {
    707 		optcom_err_ack(q, mp, TBADOPT, 0);
    708 		return (0);
    709 	}
    710 
    711 	/* Verify MGMT_flags legal */
    712 	switch (tor->MGMT_flags) {
    713 	case T_DEFAULT:
    714 	case T_NEGOTIATE:
    715 	case T_CURRENT:
    716 	case T_CHECK:
    717 		/* OK - legal request flags */
    718 		break;
    719 	default:
    720 		optcom_err_ack(q, mp, TBADFLAG, 0);
    721 		return (0);
    722 	}
    723 
    724 	/*
    725 	 * In this design, there are two passes required on the input buffer
    726 	 * mostly to accomodate variable length options and "T_ALLOPT" option
    727 	 * which has the semantics "all options of the specified level".
    728 	 *
    729 	 * For T_DEFAULT, T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make
    730 	 * a pass through the input buffer validating the details and making
    731 	 * sure each option is supported by the protocol. We also determine the
    732 	 * length of the option buffer to return. (Variable length options and
    733 	 * T_ALLOPT mean that length can be different for output buffer).
    734 	 */
    735 
    736 	pass_to_next = B_FALSE;	/* initial value */
    737 	toa_len = 0;		/* initial value */
    738 
    739 	/*
    740 	 * First pass, we do the following
    741 	 *	- estimate cumulative length needed for results
    742 	 *	- set "status" field based on permissions, option header check
    743 	 *	  etc.
    744 	 *	- determine "pass_to_next" whether we need to send request to
    745 	 *	  downstream module/driver.
    746 	 */
    747 	if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp,
    748 	    &pass_to_next, &toa_len)) != 0) {
    749 		optcom_err_ack(q, mp, t_error, 0);
    750 		return (0);
    751 	}
    752 
    753 	/*
    754 	 * A validation phase of the input buffer is done. We have also
    755 	 * obtained the length requirement and and other details about the
    756 	 * input and we liked input buffer so far.  We make another scan
    757 	 * through the input now and generate the output necessary to complete
    758 	 * the operation.
    759 	 */
    760 
    761 	toa_mp = allocb_cred(toa_len, cr);
    762 	if (!toa_mp) {
    763 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
    764 		return (0);
    765 	}
    766 
    767 	first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
    768 	if (first_mp == NULL) {
    769 		freeb(toa_mp);
    770 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
    771 		return (0);
    772 	}
    773 	first_mp->b_datap->db_type = M_CTL;
    774 	or = (opt_restart_t *)first_mp->b_rptr;
    775 	/*
    776 	 * Set initial values for generating output.
    777 	 */
    778 	or->or_worst_status = T_SUCCESS;
    779 	or->or_type = T_OPTMGMT_REQ;
    780 	or->or_private = 0;
    781 	/* remaining fields fileed in do_options_second_pass */
    782 
    783 restart:
    784 	/*
    785 	 * This routine makes another pass through the option buffer this
    786 	 * time acting on the request based on "status" result in the
    787 	 * first pass. It also performs "expansion" of T_ALLOPT into
    788 	 * all options of a certain level and acts on each for this request.
    789 	 */
    790 	if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp,
    791 	    first_mp, is_restart, &queued_status)) != 0) {
    792 		freemsg(toa_mp);
    793 		optcom_err_ack(q, mp, t_error, 0);
    794 		return (0);
    795 	}
    796 	if (queued_status) {
    797 		/* Option will be restarted */
    798 		return (EINPROGRESS);
    799 	}
    800 	worst_status = or->or_worst_status;
    801 	/* Done with the first mp */
    802 	freeb(first_mp);
    803 	toa_mp->b_cont = NULL;
    804 
    805 	/*
    806 	 * Following code relies on the coincidence that T_optmgmt_req
    807 	 * and T_optmgmt_ack are identical in binary representation
    808 	 */
    809 	toa = (struct T_optmgmt_ack *)toa_mp->b_rptr;
    810 	toa->OPT_length = (t_scalar_t)(toa_mp->b_wptr - (toa_mp->b_rptr +
    811 	    sizeof (struct T_optmgmt_ack)));
    812 	toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
    813 
    814 	toa->MGMT_flags = tor->MGMT_flags;
    815 
    816 
    817 	freemsg(mp);		/* free input mblk */
    818 
    819 	/*
     820 	 * If there is at least one option that requires a downstream
    821 	 * forwarding and if it is possible, we forward the message
    822 	 * downstream. Else we ack it.
    823 	 */
    824 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
    825 		/*
    826 		 * We pass it down as T_OPTMGMT_REQ. This code relies
    827 		 * on the happy coincidence that T_optmgmt_req and
    828 		 * T_optmgmt_ack are identical data structures
    829 		 * at the binary representation level.
    830 		 */
    831 		toa_mp->b_datap->db_type = M_PROTO;
    832 		toa->PRIM_type = T_OPTMGMT_REQ;
    833 		if (q->q_next != NULL)
    834 			putnext(q, toa_mp);
    835 		else
    836 			ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT);
    837 	} else {
    838 		toa->PRIM_type = T_OPTMGMT_ACK;
    839 		toa_mp->b_datap->db_type = M_PCPROTO;
    840 		toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */
    841 		qreply(q, toa_mp);
    842 	}
    843 	return (0);
    844 }
    845 
    846 
    847 /*
    848  * Following routine makes a pass through option buffer in mp and performs the
    849  * following tasks.
    850  *	- estimate cumulative length needed for results
    851  *	- set "status" field based on permissions, option header check
    852  *	  etc.
    853  *	- determine "pass_to_next" whether we need to send request to
    854  *	  downstream module/driver.
    855  */
    856 
    857 static t_scalar_t
    858 process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
    859     boolean_t *pass_to_nextp, size_t *toa_lenp)
    860 {
    861 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
    862 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
    863 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
    864 	optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr;
    865 	uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt;
    866 	struct T_opthdr *opt;
    867 	struct T_opthdr *opt_start, *opt_end;
    868 	opdes_t	*optd;
    869 	size_t allopt_len;
    870 	struct T_optmgmt_req *tor =
    871 	    (struct T_optmgmt_req *)mp->b_rptr;
    872 
    873 	*toa_lenp = sizeof (struct T_optmgmt_ack); /* initial value */
    874 
    875 	if ((opt_start = (struct T_opthdr *)
    876 	    mi_offset_param(mp, tor->OPT_offset, tor->OPT_length)) == NULL) {
    877 		return (TBADOPT);
    878 	}
    879 	if (!__TPI_TOPT_ISALIGNED(opt_start))
    880 		return (TBADOPT);
    881 
    882 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length);
    883 
    884 	for (opt = opt_start; opt && (opt < opt_end);
    885 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
    886 		/*
    887 		 * Validate the option for length and alignment
    888 		 * before accessing anything in it.
    889 		 */
    890 		if (!(_TPI_TOPT_VALID(opt, opt_start, opt_end)))
    891 			return (TBADOPT);
    892 
    893 		/* Find the option in the opt_arr. */
    894 		if (opt->name != T_ALLOPT) {
    895 			optd = opt_chk_lookup(opt->level, opt->name,
    896 			    opt_arr, opt_arr_cnt);
    897 			if (optd == NULL) {
    898 				/*
    899 				 * Option not found
    900 				 *
    901 				 * Verify if level is "valid" or not.
    902 				 * Note: This check is required by XTI
    903 				 *
    904 				 * TPI provider always initializes
    905 				 * the "not supported" (or whatever) status
    906 				 * for the options. Other levels leave status
    907 				 * unchanged if they do not understand an
    908 				 * option.
    909 				 */
    910 				if (topmost_tpiprovider) {
    911 					if (!opt_level_valid(opt->level,
    912 					    valid_level_arr,
    913 					    valid_level_arr_cnt))
    914 						return (TBADOPT);
    915 					/*
    916 					 * level is valid - initialize
    917 					 * option as not supported
    918 					 */
    919 					opt->status = T_NOTSUPPORT;
    920 				}
    921 
    922 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
    923 				continue;
    924 			}
    925 		} else {
    926 			/*
    927 			 * Handle T_ALLOPT case as a special case.
    928 			 * Note: T_ALLOPT does not mean anything
    929 			 * for T_CHECK operation.
    930 			 */
    931 			allopt_len = 0;
    932 			if (tor->MGMT_flags == T_CHECK ||
    933 			    !topmost_tpiprovider ||
    934 			    ((allopt_len = opt_level_allopts_lengths(opt->level,
    935 			    opt_arr, opt_arr_cnt)) == 0)) {
    936 				/*
    937 				 * This is confusing but correct !
    938 				 * It is not valid to to use T_ALLOPT with
    939 				 * T_CHECK flag.
    940 				 *
    941 				 * T_ALLOPT is assumed "expanded" at the
    942 				 * topmost_tpiprovider level so it should not
    943 				 * be there as an "option name" if this is not
    944 				 * a topmost_tpiprovider call and we fail it.
    945 				 *
    946 				 * opt_level_allopts_lengths() is used to verify
    947 				 * that "level" associated with the T_ALLOPT is
    948 				 * supported.
    949 				 *
    950 				 */
    951 				opt->status = T_FAILURE;
    952 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
    953 				continue;
    954 			}
    955 			ASSERT(allopt_len != 0); /* remove ? */
    956 
    957 			*toa_lenp += allopt_len;
    958 			opt->status = T_SUCCESS;
    959 			/* XXX - always set T_ALLOPT 'pass_to_next' for now */
    960 			*pass_to_nextp = B_TRUE;
    961 			continue;
    962 		}
    963 		/*
    964 		 * Check if option wants to flow downstream
    965 		 */
    966 		if (optd->opdes_props & OP_PASSNEXT)
    967 			*pass_to_nextp = B_TRUE;
    968 
    969 		/* Additional checks dependent on operation. */
    970 		switch (tor->MGMT_flags) {
    971 		case T_DEFAULT:
    972 		case T_CURRENT:
    973 
    974 			/*
    975 			 * The opt_chk_lookup() routine call above approved of
    976 			 * this option so we can work on the status for it
    977 			 * based on the permissions for the operation. (This
    978 			 * can override any status for it set at higher levels)
    979 			 * We assume this override is OK since chkfn at this
    980 			 * level approved of this option.
    981 			 *
    982 			 * T_CURRENT semantics:
    983 			 * The read access is required. Else option
    984 			 * status is T_NOTSUPPORT.
    985 			 *
    986 			 * T_DEFAULT semantics:
    987 			 * Note: specification is not clear on this but we
    988 			 * interpret T_DEFAULT semantics such that access to
    989 			 * read value is required for access even the default
    990 			 * value. Otherwise the option status is T_NOTSUPPORT.
    991 			 */
    992 			if (!OA_READ_PERMISSION(optd, cr)) {
    993 				opt->status = T_NOTSUPPORT;
    994 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
    995 				/* skip to next */
    996 				continue;
    997 			}
    998 
    999 			/*
   1000 			 * T_DEFAULT/T_CURRENT semantics:
   1001 			 * We know that read access is set. If no other access
   1002 			 * is set, then status is T_READONLY.
   1003 			 */
   1004 			if (OA_READONLY_PERMISSION(optd, cr))
   1005 				opt->status = T_READONLY;
   1006 			else
   1007 				opt->status = T_SUCCESS;
   1008 			/*
   1009 			 * Option passes all checks. Make room for it in the
   1010 			 * ack. Note: size stored in table does not include
   1011 			 * space for option header.
   1012 			 */
   1013 			*toa_lenp += sizeof (struct T_opthdr) +
   1014 			    _TPI_ALIGN_TOPT(optd->opdes_size);
   1015 			break;
   1016 
   1017 		case T_CHECK:
   1018 		case T_NEGOTIATE:
   1019 
   1020 			/*
   1021 			 * T_NEGOTIATE semantics:
   1022 			 * If for fixed length option value on input is not the
   1023 			 * same as value supplied, then status is T_FAILURE.
   1024 			 *
   1025 			 * T_CHECK semantics:
   1026 			 * If value is supplied, semantics same as T_NEGOTIATE.
   1027 			 * It is however ok not to supply a value with T_CHECK.
   1028 			 */
   1029 
   1030 			if (tor->MGMT_flags == T_NEGOTIATE ||
   1031 			    (opt->len != sizeof (struct T_opthdr))) {
   1032 				/*
   1033 				 * Implies "value" is specified in T_CHECK or
   1034 				 * it is a T_NEGOTIATE request.
   1035 				 * Verify size.
   1036 				 * Note: This can override anything about this
   1037 				 * option request done at a higher level.
   1038 				 */
   1039 				if (!opt_length_ok(optd, opt)) {
   1040 					/* bad size */
   1041 					*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
   1042 					opt->status = T_FAILURE;
   1043 					continue;
   1044 				}
   1045 			}
   1046 			/*
   1047 			 * The opt_chk_lookup()  routine above() approved of
   1048 			 * this option so we can work on the status for it based
   1049 			 * on the permissions for the operation. (This can
   1050 			 * override anything set at a higher level).
   1051 			 *
   1052 			 * T_CHECK/T_NEGOTIATE semantics:
   1053 			 * Set status to T_READONLY if read is the only access
   1054 			 * permitted
   1055 			 */
   1056 			if (OA_READONLY_PERMISSION(optd, cr)) {
   1057 				opt->status = T_READONLY;
   1058 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
   1059 				/* skip to next */
   1060 				continue;
   1061 			}
   1062 
   1063 			/*
   1064 			 * T_CHECK/T_NEGOTIATE semantics:
   1065 			 * If write (or execute) access is not set, then status
   1066 			 * is T_NOTSUPPORT.
   1067 			 */
   1068 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
   1069 				opt->status = T_NOTSUPPORT;
   1070 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
   1071 				/* skip to next option */
   1072 				continue;
   1073 			}
   1074 			/*
   1075 			 * Option passes all checks. Make room for it in the
   1076 			 * ack and set success in status.
   1077 			 * Note: size stored in table does not include header
   1078 			 * length.
   1079 			 */
   1080 			opt->status = T_SUCCESS;
   1081 			*toa_lenp += sizeof (struct T_opthdr) +
   1082 			    _TPI_ALIGN_TOPT(optd->opdes_size);
   1083 			break;
   1084 
   1085 		default:
   1086 			return (TBADFLAG);
   1087 		}
   1088 	} /* for loop scanning input buffer */
   1089 
   1090 	return (0);		/* OK return */
   1091 }
   1092 
   1093 /*
   1094  * This routine makes another pass through the option buffer this
   1095  * time acting on the request based on "status" result in the
   1096  * first pass. It also performs "expansion" of T_ALLOPT into
   1097  * all options of a certain level and acts on each for this request.
   1098  */
   1099 static t_scalar_t
   1100 do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
   1101     optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart,
   1102     boolean_t *queued_statusp)
   1103 {
   1104 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
   1105 	int failed_option;
   1106 	struct T_opthdr *opt;
   1107 	struct T_opthdr *opt_start, *opt_end, *restart_opt;
   1108 	uchar_t *optr;
   1109 	uint_t optset_context;
   1110 	struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr;
   1111 	opt_restart_t	*or;
   1112 	t_uscalar_t	*worst_statusp;
   1113 	int	err;
   1114 
   1115 	*queued_statusp = B_FALSE;
   1116 	or = (opt_restart_t *)first_mp->b_rptr;
   1117 	worst_statusp = &or->or_worst_status;
   1118 
   1119 	optr = (uchar_t *)ack_mp->