Home | History | Annotate | Download | only in dld
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Data-Link Driver
     28  */
     29 
     30 #include	<inet/common.h>
     31 #include	<sys/strsubr.h>
     32 #include	<sys/stropts.h>
     33 #include	<sys/strsun.h>
     34 #include	<sys/vlan.h>
     35 #include	<sys/dld_impl.h>
     36 #include	<sys/cpuvar.h>
     37 #include	<sys/callb.h>
     38 #include	<sys/list.h>
     39 #include	<sys/mac_client.h>
     40 #include	<sys/mac_client_priv.h>
     41 
     42 static int	str_constructor(void *, void *, int);
     43 static void	str_destructor(void *, void *);
     44 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
     45 static void	str_notify_promisc_on_phys(dld_str_t *);
     46 static void	str_notify_promisc_off_phys(dld_str_t *);
     47 static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
     48 static void	str_notify_link_up(dld_str_t *);
     49 static void	str_notify_link_down(dld_str_t *);
     50 static void	str_notify_capab_reneg(dld_str_t *);
     51 static void	str_notify_speed(dld_str_t *, uint32_t);
     52 
     53 static void	ioc_native(dld_str_t *,  mblk_t *);
     54 static void	ioc_margin(dld_str_t *, mblk_t *);
     55 static void	ioc_raw(dld_str_t *, mblk_t *);
     56 static void	ioc_fast(dld_str_t *,  mblk_t *);
     57 static void	ioc_lowlink(dld_str_t *,  mblk_t *);
     58 static void	ioc(dld_str_t *, mblk_t *);
     59 static void	dld_ioc(dld_str_t *, mblk_t *);
     60 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
     61 
     62 static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
     63 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
     64     link_tagmode_t);
     65 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);
     66 
     67 static uint32_t		str_count;
     68 static kmem_cache_t	*str_cachep;
     69 static mod_hash_t	*str_hashp;
     70 
     71 #define	STR_HASHSZ		64
     72 #define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
     73 
     74 #define	dld_taskq	system_taskq
     75 
     76 static kmutex_t		dld_taskq_lock;
     77 static kcondvar_t	dld_taskq_cv;
     78 static list_t		dld_taskq_list;		/* List of dld_str_t */
     79 boolean_t		dld_taskq_quit;
     80 boolean_t		dld_taskq_done;
     81 
     82 static void		dld_taskq_dispatch(void);
     83 
     84 /*
     85  * Some notes on entry points, flow-control, queueing.
     86  *
     87  * This driver exports the traditional STREAMS put entry point as well as
     88  * the non-STREAMS fast-path transmit routine which is provided to IP via
     89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
     90  * and data operations, while the fast-path routine deals only with M_DATA
     91  * fast-path packets.  Regardless of the entry point, all outbound packets
     92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
     93  *
     94  * The transmit logic operates in the following way: All packets coming
     95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
     96  * happens when the MAC layer indicates the packets couldn't be
     97  * transmitted due to 1) lack of resources (e.g. running out of
     98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
     99  * particular flow. The indication comes in the form of a Tx cookie that
    100  * identifies the blocked ring. In such case, DLD will place a
    101  * dummy message on its write-side STREAMS queue so that the queue is
    102  * marked as "full". Any subsequent packets arriving at the driver will
    103  * still be sent to the MAC layer, where they are either queued in the Tx
    104  * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
    105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
    106  * When the write service procedure runs, it will remove the dummy
    107  * message from the write-side STREAMS queue; in effect this will trigger
    108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
    109  * respectively, due to the above reasons.
    110  *
    111  * All non-data operations, both DLPI and ioctls are single threaded on a per
    112  * dld_str_t endpoint. This is done using a taskq so that the control operation
    113  * has kernel context and can cv_wait for resources. In addition all set type
    114  * operations that involve mac level state modification are serialized on a
    115  * per mac end point using the perimeter mechanism provided by the mac layer.
    116  * This serializes all mac clients trying to modify a single mac end point over
    117  * the entire sequence of mac calls made by that client as an atomic unit. The
    118  * mac framework locking is described in mac.c. A critical element is that
    119  * DLD/DLS does not hold any locks across the mac perimeter.
    120  *
    121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
    122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
    123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
    124  * is returned. If the mac handle is non-null, it can be safely accessed
    125  * below. The mac handle won't be freed until the mac_unregister which
    126  * won't happen until the driver detaches. The DDI framework ensures that
    127  * the detach won't happen while a getinfo is in progress.
    128  */
    129 typedef struct i_dld_str_state_s {
    130 	major_t		ds_major;
    131 	minor_t		ds_minor;
    132 	int		ds_instance;
    133 	dev_info_t	*ds_dip;
    134 } i_dld_str_state_t;
    135 
    136 /* ARGSUSED */
    137 static uint_t
    138 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
    139 {
    140 	i_dld_str_state_t	*statep = arg;
    141 	dld_str_t		*dsp = (dld_str_t *)val;
    142 	mac_handle_t		mh;
    143 
    144 	if (statep->ds_major != dsp->ds_major)
    145 		return (MH_WALK_CONTINUE);
    146 
    147 	ASSERT(statep->ds_minor != 0);
    148 	mh = dsp->ds_mh;
    149 
    150 	if (statep->ds_minor == dsp->ds_minor) {
    151 		/*
    152 		 * Clone: a clone minor is unique. we can terminate the
    153 		 * walk if we find a matching stream -- even if we fail
    154 		 * to obtain the devinfo.
    155 		 */
    156 		if (mh != NULL) {
    157 			statep->ds_dip = mac_devinfo_get(mh);
    158 			statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
    159 		}
    160 		return (MH_WALK_TERMINATE);
    161 	}
    162 	return (MH_WALK_CONTINUE);
    163 }
    164 
    165 static dev_info_t *
    166 dld_finddevinfo(dev_t dev)
    167 {
    168 	dev_info_t		*dip;
    169 	i_dld_str_state_t	state;
    170 
    171 	if (getminor(dev) == 0)
    172 		return (NULL);
    173 
    174 	/*
    175 	 * See if it's a minor node of a link
    176 	 */
    177 	if ((dip = dls_link_devinfo(dev)) != NULL)
    178 		return (dip);
    179 
    180 	state.ds_minor = getminor(dev);
    181 	state.ds_major = getmajor(dev);
    182 	state.ds_dip = NULL;
    183 	state.ds_instance = -1;
    184 
    185 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
    186 	return (state.ds_dip);
    187 }
    188 
    189 int
    190 dld_devt_to_instance(dev_t dev)
    191 {
    192 	minor_t			minor;
    193 	i_dld_str_state_t	state;
    194 
    195 	/*
    196 	 * GLDv3 numbers DLPI style 1 node as the instance number + 1.
    197 	 * Minor number 0 is reserved for the DLPI style 2 unattached
    198 	 * node.
    199 	 */
    200 
    201 	if ((minor = getminor(dev)) == 0)
    202 		return (-1);
    203 
    204 	/*
    205 	 * Check for unopened style 1 node.
    206 	 * Note that this doesn't *necessarily* work for legacy
    207 	 * devices, but this code is only called within the
    208 	 * getinfo(9e) implementation for true GLDv3 devices, so it
    209 	 * doesn't matter.
    210 	 */
    211 	if (minor > 0 && minor <= DLS_MAX_MINOR) {
    212 		return (DLS_MINOR2INST(minor));
    213 	}
    214 
    215 	state.ds_minor = getminor(dev);
    216 	state.ds_major = getmajor(dev);
    217 	state.ds_dip = NULL;
    218 	state.ds_instance = -1;
    219 
    220 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
    221 	return (state.ds_instance);
    222 }
    223 
    224 /*
    225  * devo_getinfo: getinfo(9e)
    226  *
    227  * NB: This may be called for a provider before the provider's
    228  * instances are attached.  Hence, if a particular provider needs a
    229  * special mapping (the mac instance != ddi_get_instance()), then it
    230  * may need to provide its own implmentation using the
    231  * mac_devt_to_instance() function, and translating the returned mac
    232  * instance to a devinfo instance.  For dev_t's where the minor number
    233  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
    234  * function indirectly via the mac_getinfo() function.
    235  */
    236 /*ARGSUSED*/
    237 int
    238 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
    239 {
    240 	dev_info_t	*devinfo;
    241 	minor_t		minor = getminor((dev_t)arg);
    242 	int		rc = DDI_FAILURE;
    243 
    244 	switch (cmd) {
    245 	case DDI_INFO_DEVT2DEVINFO:
    246 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
    247 			*(dev_info_t **)resp = devinfo;
    248 			rc = DDI_SUCCESS;
    249 		}
    250 		break;
    251 	case DDI_INFO_DEVT2INSTANCE:
    252 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
    253 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
    254 			rc = DDI_SUCCESS;
    255 		} else if (minor > DLS_MAX_MINOR &&
    256 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
    257 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
    258 			rc = DDI_SUCCESS;
    259 		}
    260 		break;
    261 	}
    262 	return (rc);
    263 }
    264 
    265 void *
    266 dld_str_private(queue_t *q)
    267 {
    268 	return (((dld_str_t *)(q->q_ptr))->ds_private);
    269 }
    270 
    271 int
    272 dld_str_open(queue_t *rq, dev_t *devp, void *private)
    273 {
    274 	dld_str_t	*dsp;
    275 	major_t		major;
    276 	minor_t		minor;
    277 	int		err;
    278 
    279 	major = getmajor(*devp);
    280 	minor = getminor(*devp);
    281 
    282 	/*
    283 	 * Create a new dld_str_t for the stream. This will grab a new minor
    284 	 * number that will be handed back in the cloned dev_t.  Creation may
    285 	 * fail if we can't allocate the dummy mblk used for flow-control.
    286 	 */
    287 	dsp = dld_str_create(rq, DLD_DLPI, major,
    288 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
    289 	if (dsp == NULL)
    290 		return (ENOSR);
    291 
    292 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
    293 	dsp->ds_private = private;
    294 	if (minor != 0) {
    295 		/*
    296 		 * Style 1 open
    297 		 */
    298 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
    299 			goto failed;
    300 
    301 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
    302 	} else {
    303 		(void) qassociate(rq, -1);
    304 	}
    305 
    306 	/*
    307 	 * Enable the queue srv(9e) routine.
    308 	 */
    309 	qprocson(rq);
    310 
    311 	/*
    312 	 * Construct a cloned dev_t to hand back.
    313 	 */
    314 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
    315 	return (0);
    316 
    317 failed:
    318 	dld_str_destroy(dsp);
    319 	return (err);
    320 }
    321 
    322 int
    323 dld_str_close(queue_t *rq)
    324 {
    325 	dld_str_t	*dsp = rq->q_ptr;
    326 
    327 	/*
    328 	 * All modules on top have been popped off. So there can't be any
    329 	 * threads from the top.
    330 	 */
    331 	ASSERT(dsp->ds_datathr_cnt == 0);
    332 
    333 	/*
    334 	 * Wait until pending DLPI requests are processed.
    335 	 */
    336 	mutex_enter(&dsp->ds_lock);
    337 	while (dsp->ds_dlpi_pending)
    338 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
    339 	mutex_exit(&dsp->ds_lock);
    340 
    341 
    342 	/*
    343 	 * This stream was open to a provider node. Check to see
    344 	 * if it has been cleanly shut down.
    345 	 */
    346 	if (dsp->ds_dlstate != DL_UNATTACHED) {
    347 		/*
    348 		 * The stream is either open to a style 1 provider or
    349 		 * this is not clean shutdown. Detach from the PPA.
    350 		 * (This is still ok even in the style 1 case).
    351 		 */
    352 		dld_str_detach(dsp);
    353 	}
    354 
    355 	dld_str_destroy(dsp);
    356 	return (0);
    357 }
    358 
    359 /*
    360  * qi_qopen: open(9e)
    361  */
    362 /*ARGSUSED*/
    363 int
    364 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
    365 {
    366 	if (sflag == MODOPEN)
    367 		return (ENOTSUP);
    368 
    369 	/*
    370 	 * This is a cloning driver and therefore each queue should only
    371 	 * ever get opened once.
    372 	 */
    373 	if (rq->q_ptr != NULL)
    374 		return (EBUSY);
    375 
    376 	return (dld_str_open(rq, devp, NULL));
    377 }
    378 
    379 /*
    380  * qi_qclose: close(9e)
    381  */
    382 int
    383 dld_close(queue_t *rq)
    384 {
    385 	/*
    386 	 * Disable the queue srv(9e) routine.
    387 	 */
    388 	qprocsoff(rq);
    389 
    390 	return (dld_str_close(rq));
    391 }
    392 
    393 /*
    394  * qi_qputp: put(9e)
    395  */
    396 void
    397 dld_wput(queue_t *wq, mblk_t *mp)
    398 {
    399 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
    400 	dld_str_mode_t	mode;
    401 
    402 	switch (DB_TYPE(mp)) {
    403 	case M_DATA:
    404 		mutex_enter(&dsp->ds_lock);
    405 		mode = dsp->ds_mode;
    406 		if ((dsp->ds_dlstate != DL_IDLE) ||
    407 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
    408 			mutex_exit(&dsp->ds_lock);
    409 			freemsg(mp);
    410 			break;
    411 		}
    412 
    413 		DLD_DATATHR_INC(dsp);
    414 		mutex_exit(&dsp->ds_lock);
    415 		if (mode == DLD_FASTPATH) {
    416 			if (dsp->ds_mip->mi_media == DL_ETHER &&
    417 			    (MBLKL(mp) < sizeof (struct ether_header))) {
    418 				freemsg(mp);
    419 			} else {
    420 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
    421 			}
    422 		} else {
    423 			str_mdata_raw_put(dsp, mp);
    424 		}
    425 		DLD_DATATHR_DCR(dsp);
    426 		break;
    427 	case M_PROTO:
    428 	case M_PCPROTO: {
    429 		t_uscalar_t	prim;
    430 
    431 		if (MBLKL(mp) < sizeof (t_uscalar_t))
    432 			break;
    433 
    434 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
    435 
    436 		if (prim == DL_UNITDATA_REQ) {
    437 			proto_unitdata_req(dsp, mp);
    438 		} else {
    439 			dld_wput_nondata(dsp, mp);
    440 		}
    441 		break;
    442 	}
    443 
    444 	case M_IOCTL:
    445 		dld_wput_nondata(dsp, mp);
    446 		break;
    447 
    448 	case M_FLUSH:
    449 		if (*mp->b_rptr & FLUSHW) {
    450 			DLD_CLRQFULL(dsp);
    451 			*mp->b_rptr &= ~FLUSHW;
    452 		}
    453 
    454 		if (*mp->b_rptr & FLUSHR) {
    455 			qreply(wq, mp);
    456 		} else {
    457 			freemsg(mp);
    458 		}
    459 		break;
    460 
    461 	default:
    462 		freemsg(mp);
    463 		break;
    464 	}
    465 }
    466 
    467 /*
    468  * qi_srvp: srv(9e)
    469  */
    470 void
    471 dld_wsrv(queue_t *wq)
    472 {
    473 	dld_str_t	*dsp = wq->q_ptr;
    474 
    475 	DLD_CLRQFULL(dsp);
    476 }
    477 
    478 void
    479 dld_init_ops(struct dev_ops *ops, const char *name)
    480 {
    481 	struct streamtab *stream;
    482 	struct qinit *rq, *wq;
    483 	struct module_info *modinfo;
    484 
    485 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
    486 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
    487 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
    488 	modinfo->mi_minpsz = 0;
    489 	modinfo->mi_maxpsz = 64*1024;
    490 	modinfo->mi_hiwat  = 1;
    491 	modinfo->mi_lowat = 0;
    492 
    493 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
    494 	rq->qi_qopen = dld_open;
    495 	rq->qi_qclose = dld_close;
    496 	rq->qi_minfo = modinfo;
    497 
    498 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
    499 	wq->qi_putp = (pfi_t)dld_wput;
    500 	wq->qi_srvp = (pfi_t)dld_wsrv;
    501 	wq->qi_minfo = modinfo;
    502 
    503 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
    504 	stream->st_rdinit = rq;
    505 	stream->st_wrinit = wq;
    506 	ops->devo_cb_ops->cb_str = stream;
    507 
    508 	if (ops->devo_getinfo == NULL)
    509 		ops->devo_getinfo = &dld_getinfo;
    510 }
    511 
    512 void
    513 dld_fini_ops(struct dev_ops *ops)
    514 {
    515 	struct streamtab *stream;
    516 	struct qinit *rq, *wq;
    517 	struct module_info *modinfo;
    518 
    519 	stream = ops->devo_cb_ops->cb_str;
    520 	rq = stream->st_rdinit;
    521 	wq = stream->st_wrinit;
    522 	modinfo = rq->qi_minfo;
    523 	ASSERT(wq->qi_minfo == modinfo);
    524 
    525 	kmem_free(stream, sizeof (struct streamtab));
    526 	kmem_free(wq, sizeof (struct qinit));
    527 	kmem_free(rq, sizeof (struct qinit));
    528 	kmem_free(modinfo->mi_idname, FMNAMESZ);
    529 	kmem_free(modinfo, sizeof (struct module_info));
    530 }
    531 
    532 /*
    533  * Initialize this module's data structures.
    534  */
    535 void
    536 dld_str_init(void)
    537 {
    538 	/*
    539 	 * Create dld_str_t object cache.
    540 	 */
    541 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
    542 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
    543 	ASSERT(str_cachep != NULL);
    544 
    545 	/*
    546 	 * Create a hash table for maintaining dld_str_t's.
    547 	 * The ds_minor field (the clone minor number) of a dld_str_t
    548 	 * is used as a key for this hash table because this number is
    549 	 * globally unique (allocated from "dls_minor_arena").
    550 	 */
    551 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
    552 	    mod_hash_null_valdtor);
    553 
    554 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
    555 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
    556 
    557 	dld_taskq_quit = B_FALSE;
    558 	dld_taskq_done = B_FALSE;
    559 	list_create(&dld_taskq_list, sizeof (dld_str_t),
    560 	    offsetof(dld_str_t, ds_tqlist));
    561 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
    562 	    &p0, TS_RUN, minclsyspri);
    563 }
    564 
    565 /*
    566  * Tear down this module's data structures.
    567  */
    568 int
    569 dld_str_fini(void)
    570 {
    571 	/*
    572 	 * Make sure that there are no objects in use.
    573 	 */
    574 	if (str_count != 0)
    575 		return (EBUSY);
    576 
    577 	/*
    578 	 * Ask the dld_taskq thread to quit and wait for it to be done
    579 	 */
    580 	mutex_enter(&dld_taskq_lock);
    581 	dld_taskq_quit = B_TRUE;
    582 	cv_signal(&dld_taskq_cv);
    583 	while (!dld_taskq_done)
    584 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
    585 	mutex_exit(&dld_taskq_lock);
    586 	list_destroy(&dld_taskq_list);
    587 	/*
    588 	 * Destroy object cache.
    589 	 */
    590 	kmem_cache_destroy(str_cachep);
    591 	mod_hash_destroy_idhash(str_hashp);
    592 	return (0);
    593 }
    594 
    595 /*
    596  * Create a new dld_str_t object.
    597  */
    598 dld_str_t *
    599 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
    600 {
    601 	dld_str_t	*dsp;
    602 	int		err;
    603 
    604 	/*
    605 	 * Allocate an object from the cache.
    606 	 */
    607 	atomic_add_32(&str_count, 1);
    608 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
    609 
    610 	/*
    611 	 * Allocate the dummy mblk for flow-control.
    612 	 */
    613 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
    614 	if (dsp->ds_tx_flow_mp == NULL) {
    615 		kmem_cache_free(str_cachep, dsp);
    616 		atomic_add_32(&str_count, -1);
    617 		return (NULL);
    618 	}
    619 	dsp->ds_type = type;
    620 	dsp->ds_major = major;
    621 	dsp->ds_style = style;
    622 
    623 	/*
    624 	 * Initialize the queue pointers.
    625 	 */
    626 	ASSERT(RD(rq) == rq);
    627 	dsp->ds_rq = rq;
    628 	dsp->ds_wq = WR(rq);
    629 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
    630 
    631 	/*
    632 	 * We want explicit control over our write-side STREAMS queue
    633 	 * where the dummy mblk gets added/removed for flow-control.
    634 	 */
    635 	noenable(WR(rq));
    636 
    637 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
    638 	    (mod_hash_val_t)dsp);
    639 	ASSERT(err == 0);
    640 	return (dsp);
    641 }
    642 
    643 /*
    644  * Destroy a dld_str_t object.
    645  */
    646 void
    647 dld_str_destroy(dld_str_t *dsp)
    648 {
    649 	queue_t		*rq;
    650 	queue_t		*wq;
    651 	mod_hash_val_t	val;
    652 
    653 	/*
    654 	 * Clear the queue pointers.
    655 	 */
    656 	rq = dsp->ds_rq;
    657 	wq = dsp->ds_wq;
    658 	ASSERT(wq == WR(rq));
    659 	rq->q_ptr = wq->q_ptr = NULL;
    660 	dsp->ds_rq = dsp->ds_wq = NULL;
    661 
    662 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
    663 	ASSERT(dsp->ds_sap == 0);
    664 	ASSERT(dsp->ds_mh == NULL);
    665 	ASSERT(dsp->ds_mch == NULL);
    666 	ASSERT(dsp->ds_promisc == 0);
    667 	ASSERT(dsp->ds_mph == NULL);
    668 	ASSERT(dsp->ds_mip == NULL);
    669 	ASSERT(dsp->ds_mnh == NULL);
    670 
    671 	ASSERT(dsp->ds_polling == B_FALSE);
    672 	ASSERT(dsp->ds_direct == B_FALSE);
    673 	ASSERT(dsp->ds_lso == B_FALSE);
    674 	ASSERT(dsp->ds_lso_max == 0);
    675 	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
    676 
    677 	/*
    678 	 * Reinitialize all the flags.
    679 	 */
    680 	dsp->ds_notifications = 0;
    681 	dsp->ds_passivestate = DLD_UNINITIALIZED;
    682 	dsp->ds_mode = DLD_UNITDATA;
    683 	dsp->ds_native = B_FALSE;
    684 	dsp->ds_nonip = B_FALSE;
    685 
    686 	ASSERT(dsp->ds_datathr_cnt == 0);
    687 	ASSERT(dsp->ds_pending_head == NULL);
    688 	ASSERT(dsp->ds_pending_tail == NULL);
    689 	ASSERT(!dsp->ds_dlpi_pending);
    690 
    691 	ASSERT(dsp->ds_dlp == NULL);
    692 	ASSERT(dsp->ds_dmap == NULL);
    693 	ASSERT(dsp->ds_rx == NULL);
    694 	ASSERT(dsp->ds_rx_arg == NULL);
    695 	ASSERT(dsp->ds_next == NULL);
    696 	ASSERT(dsp->ds_head == NULL);
    697 
    698 	/*
    699 	 * Free the dummy mblk if exists.
    700 	 */
    701 	if (dsp->ds_tx_flow_mp != NULL) {
    702 		freeb(dsp->ds_tx_flow_mp);
    703 		dsp->ds_tx_flow_mp = NULL;
    704 	}
    705 
    706 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
    707 	ASSERT(dsp == (dld_str_t *)val);
    708 
    709 	/*
    710 	 * Free the object back to the cache.
    711 	 */
    712 	kmem_cache_free(str_cachep, dsp);
    713 	atomic_add_32(&str_count, -1);
    714 }
    715 
    716 /*
    717  * kmem_cache contructor function: see kmem_cache_create(9f).
    718  */
    719 /*ARGSUSED*/
    720 static int
    721 str_constructor(void *buf, void *cdrarg, int kmflags)
    722 {
    723 	dld_str_t	*dsp = buf;
    724 
    725 	bzero(buf, sizeof (dld_str_t));
    726 
    727 	/*
    728 	 * Allocate a new minor number.
    729 	 */
    730 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
    731 		return (-1);
    732 
    733 	/*
    734 	 * Initialize the DLPI state machine.
    735 	 */
    736 	dsp->ds_dlstate = DL_UNATTACHED;
    737 
    738 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
    739 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
    740 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
    741 
    742 	return (0);
    743 }
    744 
    745 /*
    746  * kmem_cache destructor function.
    747  */
    748 /*ARGSUSED*/
    749 static void
    750 str_destructor(void *buf, void *cdrarg)
    751 {
    752 	dld_str_t	*dsp = buf;
    753 
    754 	/*
    755 	 * Release the minor number.
    756 	 */
    757 	mac_minor_rele(dsp->ds_minor);
    758 
    759 	ASSERT(dsp->ds_tx_flow_mp == NULL);
    760 
    761 	mutex_destroy(&dsp->ds_lock);
    762 	cv_destroy(&dsp->ds_datathr_cv);
    763 	cv_destroy(&dsp->ds_dlpi_pending_cv);
    764 }
    765 
    766 /*
    767  * Update the priority bits and VID (may need to insert tag if mp points
    768  * to an untagged packet.
    769  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
    770  */
    771 static mblk_t *
    772 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
    773     link_tagmode_t tagmode)
    774 {
    775 	mblk_t *hmp;
    776 	struct ether_vlan_header *evhp;
    777 	struct ether_header *ehp;
    778 	uint16_t old_tci = 0;
    779 	size_t len;
    780 
    781 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
    782 
    783 	evhp = (struct ether_vlan_header *)mp->b_rptr;
    784 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
    785 		/*
    786 		 * Tagged packet, update the priority bits.
    787 		 */
    788 		len = sizeof (struct ether_vlan_header);
    789 
    790 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
    791 			/*
    792 			 * In case some drivers only check the db_ref
    793 			 * count of the first mblk, we pullup the
    794 			 * message into a single mblk.
    795 			 */
    796 			hmp = msgpullup(mp, -1);
    797 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
    798 				freemsg(hmp);
    799 				return (NULL);
    800 			} else {
    801 				freemsg(mp);
    802 				mp = hmp;
    803 			}
    804 		}
    805 
    806 		evhp = (struct ether_vlan_header *)mp->b_rptr;
    807 		old_tci = ntohs(evhp->ether_tci);
    808 	} else {
    809 		/*
    810 		 * Untagged packet.  Two factors will cause us to insert a
    811 		 * VLAN header:
    812 		 * - This is a VLAN link (vid is specified)
    813 		 * - The link supports user priority tagging and the priority
    814 		 *   is non-zero.
    815 		 */
    816 		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
    817 			return (mp);
    818 
    819 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
    820 		if (hmp == NULL)
    821 			return (NULL);
    822 
    823 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
    824 		ehp = (struct ether_header *)mp->b_rptr;
    825 
    826 		/*
    827 		 * Copy the MAC addresses and typelen
    828 		 */
    829 		bcopy(ehp, evhp, (ETHERADDRL * 2));
    830 		evhp->ether_type = ehp->ether_type;
    831 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
    832 
    833 		hmp->b_wptr += sizeof (struct ether_vlan_header);
    834 		mp->b_rptr += sizeof (struct ether_header);
    835 
    836 		/*
    837 		 * Free the original message if it's now empty. Link the
    838 		 * rest of the messages to the header message.
    839 		 */
    840 		if (MBLKL(mp) == 0) {
    841 			hmp->b_cont = mp->b_cont;
    842 			freeb(mp);
    843 		} else {
    844 			hmp->b_cont = mp;
    845 		}
    846 		mp = hmp;
    847 	}
    848 
    849 	if (pri == 0)
    850 		pri = VLAN_PRI(old_tci);
    851 	if (vid == VLAN_ID_NONE)
    852 		vid = VLAN_ID(old_tci);
    853 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
    854 	return (mp);
    855 }
    856 
    857 /*
    858  * M_DATA put (IP fast-path mode)
    859  */
    860 mac_tx_cookie_t
    861 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
    862     uint16_t flag)
    863 {
    864 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
    865 	mblk_t *newmp;
    866 	uint_t pri;
    867 	mac_tx_cookie_t cookie;
    868 
    869 	if (is_ethernet) {
    870 		/*
    871 		 * Update the priority bits to the assigned priority.
    872 		 */
    873 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
    874 
    875 		if (pri != 0) {
    876 			newmp = i_dld_ether_header_update_tag(mp, pri,
    877 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
    878 			if (newmp == NULL)
    879 				goto discard;
    880 			mp = newmp;
    881 		}
    882 	}
    883 
    884 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
    885 		DLD_SETQFULL(dsp);
    886 	}
    887 	return (cookie);
    888 
    889 discard:
    890 	/* TODO: bump kstat? */
    891 	freemsg(mp);
    892 	return (NULL);
    893 }
    894 
    895 /*
    896  * M_DATA put (DLIOCRAW mode)
    897  */
    898 static void
    899 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
    900 {
    901 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
    902 	mblk_t *bp, *newmp;
    903 	size_t size;
    904 	mac_header_info_t mhi;
    905 	uint_t pri, vid, dvid;
    906 	uint_t max_sdu;
    907 
    908 	/*
    909 	 * Certain MAC type plugins provide an illusion for raw DLPI
    910 	 * consumers.  They pretend that the MAC layer is something that
    911 	 * it's not for the benefit of observability tools.  For example,
    912 	 * mac_wifi pretends that it's Ethernet for such consumers.
    913 	 * Here, unless native mode is enabled, we call into the MAC layer so
    914 	 * that this illusion can be maintained.  The plugin will optionally
    915 	 * transform the MAC header here into something that can be passed
    916 	 * down.  The header goes from raw mode to "cooked" mode.
    917 	 */
    918 	if (!dsp->ds_native) {
    919 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
    920 			goto discard;
    921 		mp = newmp;
    922 	}
    923 
    924 	size = MBLKL(mp);
    925 
    926 	/*
    927 	 * Check the packet is not too big and that any remaining
    928 	 * fragment list is composed entirely of M_DATA messages. (We
    929 	 * know the first fragment was M_DATA otherwise we could not
    930 	 * have got here).
    931 	 */
    932 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
    933 		if (DB_TYPE(bp) != M_DATA)
    934 			goto discard;
    935 		size += MBLKL(bp);
    936 	}
    937 
    938 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
    939 		goto discard;
    940 
    941 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
    942 	/*
    943 	 * If LSO is enabled, check the size against lso_max. Otherwise,
    944 	 * compare the packet size with max_sdu.
    945 	 */
    946 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
    947 	if (size > max_sdu + mhi.mhi_hdrsize)
    948 		goto discard;
    949 
    950 	if (is_ethernet) {
    951 		dvid = mac_client_vid(dsp->ds_mch);
    952 
    953 		/*
    954 		 * Discard the packet if this is a VLAN stream but the VID in
    955 		 * the packet is not correct.
    956 		 */
    957 		vid = VLAN_ID(mhi.mhi_tci);
    958 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
    959 			goto discard;
    960 
    961 		/*
    962 		 * Discard the packet if this packet is a tagged packet
    963 		 * but both pri and VID are 0.
    964 		 */
    965 		pri = VLAN_PRI(mhi.mhi_tci);
    966 		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
    967 		    vid == VLAN_ID_NONE)
    968 			goto discard;
    969 
    970 		/*
    971 		 * Update the priority bits to the per-stream priority if
    972 		 * priority is not set in the packet. Update the VID for
    973 		 * packets on a VLAN stream.
    974 		 */
    975 		pri = (pri == 0) ? dsp->ds_pri : 0;
    976 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
    977 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
    978 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
    979 				goto discard;
    980 			}
    981 			mp = newmp;
    982 		}
    983 	}
    984 
    985 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
    986 		/* Turn on flow-control for dld */
    987 		DLD_SETQFULL(dsp);
    988 	}
    989 	return;
    990 
    991 discard:
    992 	/* TODO: bump kstat? */
    993 	freemsg(mp);
    994 }
    995 
    996 /*
    997  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
    998  */
    999 int
   1000 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
   1001 {
   1002 	dev_t			dev;
   1003 	int			err;
   1004 	const char		*drvname;
   1005 	mac_perim_handle_t	mph = NULL;
   1006 	boolean_t		qassociated = B_FALSE;
   1007 	dls_link_t		*dlp = NULL;
   1008 	dls_dl_handle_t		ddp = NULL;
   1009 
   1010 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
   1011 		return (EINVAL);
   1012 
   1013 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
   1014 		return (ENOTSUP);
   1015 
   1016 	/*
   1017 	 * /dev node access. This will still be supported for backward
   1018 	 * compatibility reason.
   1019 	 */
   1020 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
   1021 	    (strcmp(drvname, "vnic") != 0)) {
   1022 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
   1023 			return (EINVAL);
   1024 		qassociated = B_TRUE;
   1025 	}
   1026 
   1027 	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
   1028 	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
   1029 		goto failed;
   1030 
   1031 	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
   1032 		goto failed;
   1033 
   1034 	/*
   1035 	 * Open a channel.
   1036 	 */
   1037 	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
   1038 		goto failed;
   1039 
   1040 	if ((err = dls_open(dlp, ddp, dsp)) != 0)
   1041 		goto failed;
   1042 
   1043 	/*
   1044 	 * Set the default packet priority.
   1045 	 */
   1046 	dsp->ds_pri = 0;
   1047 
   1048 	/*
   1049 	 * Add a notify function so that the we get updates from the MAC.
   1050 	 */
   1051 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
   1052 	dsp->ds_dlstate = DL_UNBOUND;
   1053 	mac_perim_exit(mph);
   1054 	return (0);
   1055 
   1056 failed:
   1057 	if (dlp != NULL)
   1058 		dls_link_rele(dlp);
   1059 	if (mph != NULL)
   1060 		mac_perim_exit(mph);
   1061 	if (ddp != NULL)
   1062 		dls_devnet_rele(ddp);
   1063 	if (qassociated)
   1064 		(void) qassociate(dsp->ds_wq, -1);
   1065 
   1066 	return (err);
   1067 }
   1068 
   1069 /*
   1070  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
   1071  * from close(2) for style 2.
   1072  */
   1073 void
   1074 dld_str_detach(dld_str_t *dsp)
   1075 {
   1076 	mac_perim_handle_t	mph;
   1077 	int			err;
   1078 
   1079 	ASSERT(dsp->ds_datathr_cnt == 0);
   1080 
   1081 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
   1082 	/*
   1083 	 * Remove the notify function.
   1084 	 *
   1085 	 * Note that we cannot wait for the notification callback to be removed
   1086 	 * since it could cause the deadlock with str_notify() since they both
   1087 	 * need the mac perimeter. Continue if we cannot remove the
   1088 	 * notification callback right now and wait after we leave the
   1089 	 * perimeter.
   1090 	 */
   1091 	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
   1092 	dsp->ds_mnh = NULL;
   1093 
   1094 	/*
   1095 	 * Disable the capabilities
   1096 	 */
   1097 	dld_capabilities_disable(dsp);
   1098 
   1099 	/*
   1100 	 * Clear LSO flags.
   1101 	 */
   1102 	dsp->ds_lso = B_FALSE;
   1103 	dsp->ds_lso_max = 0;
   1104 
   1105 	dls_close(dsp);
   1106 	mac_perim_exit(mph);
   1107 
   1108 	/*
   1109 	 * Now we leave the mac perimeter. If mac_notify_remove() failed
   1110 	 * because the notification callback was in progress, wait for
   1111 	 * it to finish before we proceed.
   1112 	 */
   1113 	if (err != 0)
   1114 		mac_notify_remove_wait(dsp->ds_mh);
   1115 
   1116 	/*
   1117 	 * An unreferenced tagged (non-persistent) vlan gets destroyed
   1118 	 * automatically in the call to dls_devnet_rele.
   1119 	 */
   1120 	dls_devnet_rele(dsp->ds_ddh);
   1121 
   1122 	dsp->ds_sap = 0;
   1123 	dsp->ds_mh = NULL;
   1124 	dsp->ds_mch = NULL;
   1125 	dsp->ds_mip = NULL;
   1126 
   1127 	if (dsp->ds_style == DL_STYLE2)
   1128 		(void) qassociate(dsp->ds_wq, -1);
   1129 
   1130 	/*
   1131 	 * Re-initialize the DLPI state machine.
   1132 	 */
   1133 	dsp->ds_dlstate = DL_UNATTACHED;
   1134 }
   1135 
   1136 /*
   1137  * This function is only called for VLAN streams. In raw mode, we strip VLAN
   1138  * tags before sending packets up to the DLS clients, with the exception of
   1139  * special priority tagged packets, in that case, we set the VID to 0.
   1140  * mp must be a VLAN tagged packet.
   1141  */
   1142 static mblk_t *
   1143 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
   1144 {
   1145 	mblk_t *newmp;
   1146 	struct ether_vlan_header *evhp;
   1147 	uint16_t tci, new_tci;
   1148 
   1149 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
   1150 	if (DB_REF(mp) > 1) {
   1151 		newmp = copymsg(mp);
   1152 		if (newmp == NULL)
   1153 			return (NULL);
   1154 		freemsg(mp);
   1155 		mp = newmp;
   1156 	}
   1157 	evhp = (struct ether_vlan_header *)mp->b_rptr;
   1158 
   1159 	tci = ntohs(evhp->ether_tci);
   1160 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
   1161 		/*
   1162 		 * Priority is 0, strip the tag.
   1163 		 */
   1164 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
   1165 		mp->b_rptr += VLAN_TAGSZ;
   1166 	} else {
   1167 		/*
   1168 		 * Priority is not 0, update the VID to 0.
   1169 		 */
   1170 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
   1171 		evhp->ether_tci = htons(new_tci);
   1172 	}
   1173 	return (mp);
   1174 }
   1175 
   1176 /*
   1177  * Raw mode receive function.
   1178  */
   1179 /*ARGSUSED*/
   1180 void
   1181 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
   1182     mac_header_info_t *mhip)
   1183 {
   1184 	dld_str_t *dsp = (dld_str_t *)arg;
   1185 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
   1186 	mblk_t *next, *newmp;
   1187 
   1188 	ASSERT(mp != NULL);
   1189 	do {
   1190 		/*
   1191 		 * Get the pointer to the next packet in the chain and then
   1192 		 * clear b_next before the packet gets passed on.
   1193 		 */
   1194 		next = mp->b_next;
   1195 		mp->b_next = NULL;
   1196 
   1197 		/*
   1198 		 * Wind back b_rptr to point at the MAC header.
   1199 		 */
   1200 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
   1201 		mp->b_rptr -= mhip->mhi_hdrsize;
   1202 
   1203 		/*
   1204 		 * Certain MAC type plugins provide an illusion for raw
   1205 		 * DLPI consumers.  They pretend that the MAC layer is
   1206 		 * something that it's not for the benefit of observability
   1207 		 * tools.  For example, mac_wifi pretends that it's Ethernet
   1208 		 * for such consumers.	Here, unless native mode is enabled,
   1209 		 * we call into the MAC layer so that this illusion can be
   1210 		 * maintained.	The plugin will optionally transform the MAC
   1211 		 * header here into something that can be passed up to raw
   1212 		 * consumers.  The header goes from "cooked" mode to raw mode.
   1213 		 */
   1214 		if (!dsp->ds_native) {
   1215 			newmp = mac_header_uncook(dsp->ds_mh, mp);
   1216 			if (newmp == NULL) {
   1217 				freemsg(mp);
   1218 				goto next;
   1219 			}
   1220 			mp = newmp;
   1221 		}
   1222 
   1223 		/*
   1224 		 * Strip the VLAN tag for VLAN streams.
   1225 		 */
   1226 		if (is_ethernet &&
   1227 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
   1228 			/*
   1229 			 * The priority should be kept only for VLAN
   1230 			 * data-links.
   1231 			 */
   1232 			newmp = i_dld_ether_header_strip_tag(mp,
   1233 			    mac_client_is_vlan_vnic(dsp->ds_mch));
   1234 			if (newmp == NULL) {
   1235 				freemsg(mp);
   1236 				goto next;
   1237 			}
   1238 			mp = newmp;
   1239 		}
   1240 
   1241 		/*
   1242 		 * Pass the packet on.
   1243 		 */
   1244 		if (canputnext(dsp->ds_rq))
   1245 			putnext(dsp->ds_rq, mp);
   1246 		else
   1247 			freemsg(mp);
   1248 
   1249 next:
   1250 		/*
   1251 		 * Move on to the next packet in the chain.
   1252 		 */
   1253 		mp = next;
   1254 	} while (mp != NULL);
   1255 }
   1256 
   1257 /*
   1258  * Fast-path receive function.
   1259  */
   1260 /*ARGSUSED*/
   1261 void
   1262 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
   1263     mac_header_info_t *mhip)
   1264 {
   1265 	dld_str_t *dsp = (dld_str_t *)arg;
   1266 	mblk_t *next;
   1267 	size_t offset = 0;
   1268 
   1269 	/*
   1270 	 * MAC header stripping rules:
   1271 	 *    - Tagged packets:
   1272 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
   1273 	 *	b. Physical streams
   1274 	 *	- VLAN packets (non-zero VID). The stream must be either a
   1275 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
   1276 	 *	  Strip the Ethernet header but keep the VLAN header.
   1277 	 *	- Special tagged packets (zero VID)
   1278 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
   1279 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
   1280 	 *	    keep the VLAN header.
   1281 	 *	  * Otherwise, strip the whole VLAN header.
   1282 	 *    - Untagged packets. Strip the whole MAC header.
   1283 	 */
   1284 	if (mhip->mhi_istagged &&
   1285 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
   1286 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
   1287 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
   1288 		offset = VLAN_TAGSZ;
   1289 	}
   1290 
   1291 	ASSERT(mp != NULL);
   1292 	do {
   1293 		/*
   1294 		 * Get the pointer to the next packet in the chain and then
   1295 		 * clear b_next before the packet gets passed on.
   1296 		 */
   1297 		next = mp->b_next;
   1298 		mp->b_next = NULL;
   1299 
   1300 		/*
   1301 		 * Wind back b_rptr to point at the VLAN header.
   1302 		 */
   1303 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
   1304 		mp->b_rptr -= offset;
   1305 
   1306 		/*
   1307 		 * Pass the packet on.
   1308 		 */
   1309 		if (canputnext(dsp->ds_rq))
   1310 			putnext(dsp->ds_rq, mp);
   1311 		else
   1312 			freemsg(mp);
   1313 		/*
   1314 		 * Move on to the next packet in the chain.
   1315 		 */
   1316 		mp = next;
   1317 	} while (mp != NULL);
   1318 }
   1319 
   1320 /*
   1321  * Default receive function (send DL_UNITDATA_IND messages).
   1322  */
   1323 /*ARGSUSED*/
   1324 void
   1325 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
   1326     mac_header_info_t *mhip)
   1327 {
   1328 	dld_str_t		*dsp = (dld_str_t *)arg;
   1329 	mblk_t			*ud_mp;
   1330 	mblk_t			*next;
   1331 	size_t			offset = 0;
   1332 	boolean_t		strip_vlan = B_TRUE;
   1333 
   1334 	/*
   1335 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
   1336 	 */
   1337 	if (mhip->mhi_istagged &&
   1338 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
   1339 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
   1340 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
   1341 		offset = VLAN_TAGSZ;
   1342 		strip_vlan = B_FALSE;
   1343 	}
   1344 
   1345 	ASSERT(mp != NULL);
   1346 	do {
   1347 		/*
   1348 		 * Get the pointer to the next packet in the chain and then
   1349 		 * clear b_next before the packet gets passed on.
   1350 		 */
   1351 		next = mp->b_next;
   1352 		mp->b_next = NULL;
   1353 
   1354 		/*
   1355 		 * Wind back b_rptr to point at the MAC header.
   1356 		 */
   1357 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
   1358 		mp->b_rptr -= mhip->mhi_hdrsize;
   1359 
   1360 		/*
   1361 		 * Create the DL_UNITDATA_IND M_PROTO.
   1362 		 */
   1363 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
   1364 			freemsgchain(mp);
   1365 			return;
   1366 		}
   1367 
   1368 		/*
   1369 		 * Advance b_rptr to point at the payload (or the VLAN header).
   1370 		 */
   1371 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
   1372 
   1373 		/*
   1374 		 * Prepend the DL_UNITDATA_IND.
   1375 		 */
   1376 		ud_mp->b_cont = mp;
   1377 
   1378 		/*
   1379 		 * Send the message.
   1380 		 */
   1381 		if (canputnext(dsp->ds_rq))
   1382 			putnext(dsp->ds_rq, ud_mp);
   1383 		else
   1384 			freemsg(ud_mp);
   1385 
   1386 		/*
   1387 		 * Move on to the next packet in the chain.
   1388 		 */
   1389 		mp = next;
   1390 	} while (mp != NULL);
   1391 }
   1392 
   1393 /*
   1394  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
   1395  */
   1396 static void
   1397 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
   1398 {
   1399 	mblk_t		*mp;
   1400 	dl_notify_ind_t *dlip;
   1401 
   1402 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
   1403 		return;
   1404 
   1405 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1406 	    M_PROTO, 0)) == NULL)
   1407 		return;
   1408 
   1409 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1410 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1411 	dlip->dl_primitive = DL_NOTIFY_IND;
   1412 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
   1413 	dlip->dl_data = max_sdu;
   1414 
   1415 	qreply(dsp->ds_wq, mp);
   1416 }
   1417 
   1418 /*
   1419  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
   1420  * current state of the interface.
   1421  */
   1422 void
   1423 dld_str_notify_ind(dld_str_t *dsp)
   1424 {
   1425 	mac_notify_type_t	type;
   1426 
   1427 	for (type = 0; type < MAC_NNOTE; type++)
   1428 		str_notify(dsp, type);
   1429 }
   1430 
   1431 typedef struct dl_unitdata_ind_wrapper {
   1432 	dl_unitdata_ind_t	dl_unitdata;
   1433 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
   1434 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
   1435 } dl_unitdata_ind_wrapper_t;
   1436 
   1437 /*
   1438  * Create a DL_UNITDATA_IND M_PROTO message.
   1439  */
   1440 static mblk_t *
   1441 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
   1442 {
   1443 	mblk_t				*nmp;
   1444 	dl_unitdata_ind_wrapper_t	*dlwp;
   1445 	dl_unitdata_ind_t		*dlp;
   1446 	mac_header_info_t		mhi;
   1447 	uint_t				addr_length;
   1448 	uint8_t				*daddr;
   1449 	uint8_t				*saddr;
   1450 
   1451 	/*
   1452 	 * Get the packet header information.
   1453 	 */
   1454 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
   1455 		return (NULL);
   1456 
   1457 	/*
   1458 	 * Allocate a message large enough to contain the wrapper structure
   1459 	 * defined above.
   1460 	 */
   1461 	if ((nmp = mexchange(dsp->ds_wq, NULL,
   1462 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
   1463 	    DL_UNITDATA_IND)) == NULL)
   1464 		return (NULL);
   1465 
   1466 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
   1467 
   1468 	dlp = &(dlwp->dl_unitdata);
   1469 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
   1470 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
   1471 
   1472 	/*
   1473 	 * Copy in the destination address.
   1474 	 */
   1475 	addr_length = dsp->ds_mip->mi_addr_length;
   1476 	daddr = dlwp->dl_dest_addr;
   1477 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
   1478 	bcopy(mhi.mhi_daddr, daddr, addr_length);
   1479 
   1480 	/*
   1481 	 * Set the destination DLSAP to the SAP value encoded in the packet.
   1482 	 */
   1483 	if (mhi.mhi_istagged && !strip_vlan)
   1484 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
   1485 	else
   1486 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
   1487 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
   1488 
   1489 	/*
   1490 	 * If the destination address was multicast or broadcast then the
   1491 	 * dl_group_address field should be non-zero.
   1492 	 */
   1493 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
   1494 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
   1495 
   1496 	/*
   1497 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
   1498 	 * for example) may not have access to source information.
   1499 	 */
   1500 	if (mhi.mhi_saddr == NULL) {
   1501 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
   1502 	} else {
   1503 		saddr = dlwp->dl_src_addr;
   1504 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
   1505 		bcopy(mhi.mhi_saddr, saddr, addr_length);
   1506 
   1507 		/*
   1508 		 * Set the source DLSAP to the packet ethertype.
   1509 		 */
   1510 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
   1511 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
   1512 	}
   1513 
   1514 	return (nmp);
   1515 }
   1516 
   1517 /*
   1518  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
   1519  */
   1520 static void
   1521 str_notify_promisc_on_phys(dld_str_t *dsp)
   1522 {
   1523 	mblk_t		*mp;
   1524 	dl_notify_ind_t	*dlip;
   1525 
   1526 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
   1527 		return;
   1528 
   1529 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1530 	    M_PROTO, 0)) == NULL)
   1531 		return;
   1532 
   1533 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1534 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1535 	dlip->dl_primitive = DL_NOTIFY_IND;
   1536 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
   1537 
   1538 	qreply(dsp->ds_wq, mp);
   1539 }
   1540 
   1541 /*
   1542  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
   1543  */
   1544 static void
   1545 str_notify_promisc_off_phys(dld_str_t *dsp)
   1546 {
   1547 	mblk_t		*mp;
   1548 	dl_notify_ind_t	*dlip;
   1549 
   1550 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
   1551 		return;
   1552 
   1553 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1554 	    M_PROTO, 0)) == NULL)
   1555 		return;
   1556 
   1557 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1558 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1559 	dlip->dl_primitive = DL_NOTIFY_IND;
   1560 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
   1561 
   1562 	qreply(dsp->ds_wq, mp);
   1563 }
   1564 
   1565 /*
   1566  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
   1567  */
   1568 static void
   1569 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
   1570 {
   1571 	mblk_t		*mp;
   1572 	dl_notify_ind_t	*dlip;
   1573 	uint_t		addr_length;
   1574 	uint16_t	ethertype;
   1575 
   1576 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
   1577 		return;
   1578 
   1579 	addr_length = dsp->ds_mip->mi_addr_length;
   1580 	if ((mp = mexchange(dsp->ds_wq, NULL,
   1581 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
   1582 	    M_PROTO, 0)) == NULL)
   1583 		return;
   1584 
   1585 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1586 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1587 	dlip->dl_primitive = DL_NOTIFY_IND;
   1588 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
   1589 	dlip->dl_data = addr_type;
   1590 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
   1591 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
   1592 
   1593 	bcopy(addr, &dlip[1], addr_length);
   1594 
   1595 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
   1596 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
   1597 
   1598 	qreply(dsp->ds_wq, mp);
   1599 }
   1600 
   1601 /*
   1602  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
   1603  */
   1604 static void
   1605 str_notify_link_up(dld_str_t *dsp)
   1606 {
   1607 	mblk_t		*mp;
   1608 	dl_notify_ind_t	*dlip;
   1609 
   1610 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
   1611 		return;
   1612 
   1613 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1614 	    M_PROTO, 0)) == NULL)
   1615 		return;
   1616 
   1617 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1618 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1619 	dlip->dl_primitive = DL_NOTIFY_IND;
   1620 	dlip->dl_notification = DL_NOTE_LINK_UP;
   1621 
   1622 	qreply(dsp->ds_wq, mp);
   1623 }
   1624 
   1625 /*
   1626  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
   1627  */
   1628 static void
   1629 str_notify_link_down(dld_str_t *dsp)
   1630 {
   1631 	mblk_t		*mp;
   1632 	dl_notify_ind_t	*dlip;
   1633 
   1634 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
   1635 		return;
   1636 
   1637 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1638 	    M_PROTO, 0)) == NULL)
   1639 		return;
   1640 
   1641 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1642 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1643 	dlip->dl_primitive = DL_NOTIFY_IND;
   1644 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
   1645 
   1646 	qreply(dsp->ds_wq, mp);
   1647 }
   1648 
   1649 /*
   1650  * DL_NOTIFY_IND: DL_NOTE_SPEED
   1651  */
   1652 static void
   1653 str_notify_speed(dld_str_t *dsp, uint32_t speed)
   1654 {
   1655 	mblk_t		*mp;
   1656 	dl_notify_ind_t	*dlip;
   1657 
   1658 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
   1659 		return;
   1660 
   1661 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1662 	    M_PROTO, 0)) == NULL)
   1663 		return;
   1664 
   1665 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1666 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1667 	dlip->dl_primitive = DL_NOTIFY_IND;
   1668 	dlip->dl_notification = DL_NOTE_SPEED;
   1669 	dlip->dl_data = speed;
   1670 
   1671 	qreply(dsp->ds_wq, mp);
   1672 }
   1673 
   1674 /*
   1675  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
   1676  */
   1677 static void
   1678 str_notify_capab_reneg(dld_str_t *dsp)
   1679 {
   1680 	mblk_t		*mp;
   1681 	dl_notify_ind_t	*dlip;
   1682 
   1683 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
   1684 		return;
   1685 
   1686 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1687 	    M_PROTO, 0)) == NULL)
   1688 		return;
   1689 
   1690 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1691 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1692 	dlip->dl_primitive = DL_NOTIFY_IND;
   1693 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
   1694 
   1695 	qreply(dsp->ds_wq, mp);
   1696 }
   1697 
   1698 /*
   1699  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
   1700  */
   1701 static void
   1702 str_notify_fastpath_flush(dld_str_t *dsp)
   1703 {
   1704 	mblk_t		*mp;
   1705 	dl_notify_ind_t	*dlip;
   1706 
   1707 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
   1708 		return;
   1709 
   1710 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
   1711 	    M_PROTO, 0)) == NULL)
   1712 		return;
   1713 
   1714 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
   1715 	dlip = (dl_notify_ind_t *)mp->b_rptr;
   1716 	dlip->dl_primitive = DL_NOTIFY_IND;
   1717 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
   1718 
   1719 	qreply(dsp->ds_wq, mp);
   1720 }
   1721 
   1722 /*
   1723  * MAC notification callback.
   1724  */
   1725 void
   1726 str_notify(void *arg, mac_notify_type_t type)
   1727 {
   1728 	dld_str_t		*dsp = (dld_str_t *)arg;
   1729 	queue_t			*q = dsp->ds_wq;
   1730 	mac_handle_t		mh = dsp->ds_mh;
   1731 	mac_client_handle_t	mch = dsp->ds_mch;
   1732 	uint8_t			addr[MAXMACADDRLEN];
   1733 
   1734 	switch (type) {
   1735 	case MAC_NOTE_TX:
   1736 		qenable(q);
   1737 		break;
   1738 
   1739 	case MAC_NOTE_DEVPROMISC:
   1740 		/*
   1741 		 * Send the appropriate DL_NOTIFY_IND.
   1742 		 */
   1743 		if (mac_promisc_get(mh))
   1744 			str_notify_promisc_on_phys(dsp);
   1745 		else
   1746 			str_notify_promisc_off_phys(dsp);
   1747 		break;
   1748 
   1749 	case MAC_NOTE_UNICST:
   1750 		/*
   1751 		 * This notification is sent whenever the MAC unicast
   1752 		 * address changes.
   1753 		 */
   1754 		mac_unicast_primary_get(mh, addr);
   1755 
   1756 		/*
   1757 		 * Send the appropriate DL_NOTIFY_IND.
   1758 		 */
   1759 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
   1760 		break;
   1761 
   1762 	case MAC_NOTE_DEST:
   1763 		/*
   1764 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
   1765 		 * destination address.
   1766 		 */
   1767 		if (mac_dst_get(dsp->ds_mh, addr))
   1768 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
   1769 		break;
   1770 
   1771 	case MAC_NOTE_LOWLINK:
   1772 	case MAC_NOTE_LINK:
   1773 		/*
   1774 		 * LOWLINK refers to the actual link status. For links that
   1775 		 * are not part of a bridge instance LOWLINK and LINK state
   1776 		 * are the same. But for a link part of a bridge instance
   1777 		 * LINK state refers to the aggregate link status: "up" when
   1778 		 * at least one link part of the bridge is up and is "down"
   1779 		 * when all links part of the bridge are down.
   1780 		 *
   1781 		 * Clients can request to be notified of the LOWLINK state
   1782 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
   1783 		 * daemon request lowlink state changes and upper layer clients
   1784 		 * receive notifications of the aggregate link state changes
   1785 		 * which is the default when requesting LINK UP/DOWN state
   1786 		 * notifications.
   1787 		 */
   1788 
   1789 		/*
   1790 		 * Check that the notification type matches the one that we
   1791 		 * want.  If we want lower-level link notifications, and this
   1792 		 * is upper, or if we want upper and this is lower, then
   1793 		 * ignore.
   1794 		 */
   1795 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
   1796 			break;
   1797 		/*
   1798 		 * This notification is sent every time the MAC driver
   1799 		 * updates the link state.
   1800 		 */
   1801 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
   1802 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
   1803 		case LINK_STATE_UP: {
   1804 			uint64_t speed;
   1805 			/*
   1806 			 * The link is up so send the appropriate
   1807 			 * DL_NOTIFY_IND.
   1808 			 */
   1809 			str_notify_link_up(dsp);
   1810 
   1811 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
   1812 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
   1813 			break;
   1814 		}
   1815 		case LINK_STATE_DOWN:
   1816 			/*
   1817 			 * The link is down so send the appropriate
   1818 			 * DL_NOTIFY_IND.
   1819 			 */
   1820 			str_notify_link_down(dsp);
   1821 			break;
   1822 
   1823 		default:
   1824 			break;
   1825 		}
   1826 		break;
   1827 
   1828 	case MAC_NOTE_CAPAB_CHG:
   1829 		/*
   1830 		 * This notification is sent whenever the MAC resources
   1831 		 * change or capabilities change. We need to renegotiate
   1832 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
   1833 		 */
   1834 		str_notify_capab_reneg(dsp);
   1835 		break;
   1836 
   1837 	case MAC_NOTE_SDU_SIZE: {
   1838 		uint_t  max_sdu;
   1839 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
   1840 		str_notify_sdu_size(dsp, max_sdu);
   1841 		break;
   1842 	}
   1843 
   1844 	case MAC_NOTE_FASTPATH_FLUSH:
   1845 		str_notify_fastpath_flush(dsp);
   1846 		break;
   1847 
   1848 	/* Unused notifications */
   1849 	case MAC_NOTE_MARGIN:
   1850 		break;
   1851 
   1852 	default:
   1853 		ASSERT(B_FALSE);
   1854 		break;
   1855 	}
   1856 }
   1857 
   1858 /*
   1859  * This function is called via a taskq mechansim to process all control
   1860  * messages on a per 'dsp' end point.
   1861  */
   1862 static void
   1863 dld_wput_nondata_task(void *arg)
   1864 {
   1865 	dld_str_t	*dsp = arg;
   1866 	mblk_t		*mp;
   1867 
   1868 	mutex_enter(&dsp->ds_lock);
   1869 	while (dsp->ds_pending_head != NULL) {
   1870 		mp = dsp->ds_pending_head;
   1871 		dsp->ds_pending_head = mp->b_next;
   1872 		mp->b_next = NULL;
   1873 		if (dsp->ds_pending_head == NULL)
   1874 			dsp->ds_pending_tail = NULL;
   1875 		mutex_exit(&dsp->ds_lock);
   1876 
   1877 		switch (DB_TYPE(mp)) {
   1878 		case M_PROTO:
   1879 		case M_PCPROTO:
   1880 			dld_proto(dsp, mp);
   1881 			break;
   1882 		case M_IOCTL:
   1883 			dld_ioc(dsp, mp);
   1884 			break;
   1885 		default:
   1886 			ASSERT(0);
   1887 		}
   1888 
   1889 		mutex_enter(&dsp->ds_lock);
   1890 	}
   1891 	ASSERT(dsp->ds_pending_tail == NULL);
   1892 	dsp->ds_dlpi_pending = 0;
   1893 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
   1894 	mutex_exit(&dsp->ds_lock);
   1895 }
   1896 
   1897 /*
   1898  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
   1899  * thread is started at boot time.
   1900  */
   1901 static void
   1902 dld_taskq_dispatch(void)
   1903 {
   1904 	callb_cpr_t	cprinfo;
   1905 	dld_str_t	*dsp;
   1906 
   1907 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
   1908 	    "dld_taskq_dispatch");
   1909 	mutex_enter(&dld_taskq_lock);
   1910 
   1911 	while (!dld_taskq_quit) {
   1912 		dsp = list_head(&dld_taskq_list);
   1913 		while (dsp != NULL) {
   1914 			list_remove(&dld_taskq_list, dsp);
   1915 			mutex_exit(&dld_taskq_lock);
   1916 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
   1917 			    dsp, TQ_SLEEP) != 0);
   1918 			mutex_enter(&dld_taskq_lock);
   1919 			dsp = list_head(&dld_taskq_list);
   1920 		}
   1921 
   1922 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1923 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
   1924 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
   1925 	}
   1926 
   1927 	dld_taskq_done = B_TRUE;
   1928 	cv_signal(&dld_taskq_cv);
   1929 	CALLB_CPR_EXIT(&cprinfo);
   1930 	thread_exit();
   1931 }
   1932 
   1933 /*
   1934  * All control operations are serialized on the 'dsp' and are also funneled
   1935  * through a taskq mechanism to ensure that subsequent processing has kernel
   1936  * context and can safely use cv_wait.
   1937  *
   1938  * Mechanisms to handle taskq dispatch failures
   1939  *
   1940  * The only way to be sure that taskq dispatch does not fail is to either
   1941  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
   1942  * some number of entries and make sure that the number of outstanding requests
   1943  * are less than that number. We can't use TQ_SLEEP since we don't know the
   1944  * context. Nor can we bound the total number of 'dsp' end points. So we are
   1945  * unable to use either of the above schemes, and are forced to deal with
   1946  * taskq dispatch failures. Note that even dynamic taskq could fail in
   1947  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
   1948  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
   1949  * framework.
   1950  *
   1951  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
   1952  * We also have a single global thread to retry the taskq dispatch. This
   1953  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
   1954  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
   1955  */
   1956 static void
   1957 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
   1958 {
   1959 	ASSERT(mp->b_next == NULL);
   1960 	mutex_enter(&dsp->ds_lock);
   1961 	if (dsp->ds_pending_head != NULL) {
   1962 		ASSERT(dsp->ds_dlpi_pending);
   1963 		dsp->ds_pending_tail->b_next = mp;
   1964 		dsp->ds_pending_tail = mp;
   1965 		mutex_exit(&dsp->ds_lock);
   1966 		return;
   1967 	}
   1968 	ASSERT(dsp->ds_pending_tail == NULL);
   1969 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
   1970 	/*
   1971 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
   1972 	 * thread is still active and is processing the last message, though
   1973 	 * the pending queue has been emptied.
   1974 	 */
   1975 	if (dsp->ds_dlpi_pending) {
   1976 		mutex_exit(&dsp->ds_lock);
   1977 		return;
   1978 	}
   1979 
   1980 	dsp->ds_dlpi_pending = 1;
   1981 	mutex_exit(&dsp->ds_lock);
   1982 
   1983 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
   1984 	    TQ_NOSLEEP) != 0)
   1985 		return;
   1986 
   1987 	mutex_enter(&dld_taskq_lock);
   1988 	list_insert_tail(&dld_taskq_list, dsp);
   1989 	cv_signal(&dld_taskq_cv);
   1990 	mutex_exit(&dld_taskq_lock);
   1991 }
   1992 
   1993 /*
   1994  * Process an M_IOCTL message.
   1995  */
   1996 static void
   1997 dld_ioc(dld_str_t *dsp, mblk_t *mp)
   1998 {
   1999 	uint_t			cmd;
   2000 
   2001 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
   2002 	ASSERT(dsp->ds_type == DLD_DLPI);
   2003 
   2004 	switch (cmd) {
   2005 	case DLIOCNATIVE:
   2006 		ioc_native(dsp, mp);
   2007 		break;
   2008 	case DLIOCMARGININFO:
   2009 		ioc_margin(dsp, mp);
   2010 		break;
   2011 	case DLIOCRAW:
   2012 		ioc_raw(dsp, mp);
   2013 		break;
   2014 	case DLIOCHDRINFO:
   2015 		ioc_fast(dsp, mp);
   2016 		break;
   2017 	case DLIOCLOWLINK:
   2018 		ioc_lowlink(dsp, mp);
   2019 		break;
   2020 	default:
   2021 		ioc(dsp, mp);
   2022 	}
   2023 }
   2024 
   2025 /*
   2026  * DLIOCNATIVE
   2027  */
   2028 static void
   2029 ioc_native(dld_str_t *dsp, mblk_t *mp)
   2030 {
   2031 	queue_t *q = dsp->ds_wq;
   2032 	const mac_info_t *mip = dsp->ds_mip;
   2033 
   2034 	/*
   2035 	 * Native mode can be enabled if it's disabled and if the
   2036 	 * native media type is different.
   2037 	 */
   2038 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
   2039 		dsp->ds_native = B_TRUE;
   2040 
   2041 	if (dsp->ds_native)
   2042 		miocack(q, mp, 0, mip->mi_nativemedia);
   2043 	else
   2044 		miocnak(q, mp, 0, ENOTSUP);
   2045 }
   2046 
   2047 /*
   2048  * DLIOCMARGININFO
   2049  */
   2050 static void
   2051 ioc_margin(dld_str_t *dsp, mblk_t *mp)
   2052 {
   2053 	queue_t *q = dsp->ds_wq;
   2054 	uint32_t margin;
   2055 	int err;
   2056 
   2057 	if (dsp->ds_dlstate == DL_UNATTACHED) {
   2058 		err = EINVAL;
   2059 		goto failed;
   2060 	}
   2061 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
   2062 		goto failed;
   2063 
   2064 	mac_margin_get(dsp->ds_mh, &margin);
   2065 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
   2066 	miocack(q, mp, sizeof (uint32_t), 0);
   2067 	return;
   2068 
   2069 failed:
   2070 	miocnak(q, mp, 0, err);
   2071 }
   2072 
   2073 /*
   2074  * DLIOCRAW
   2075  */
   2076 static void
   2077 ioc_raw(dld_str_t *dsp, mblk_t *mp)
   2078 {
   2079 	queue_t *q = dsp->ds_wq;
   2080 	mac_perim_handle_t	mph;
   2081 
   2082 	if (dsp->ds_mh == NULL) {
   2083 		dsp->ds_mode = DLD_RAW;
   2084 		miocack(q, mp, 0, 0);
   2085 		return;
   2086 	}
   2087 
   2088 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
   2089 	if (dsp->ds_polling || dsp->ds_direct) {
   2090 		mac_perim_exit(mph);
   2091 		miocnak(q, mp, 0, EPROTO);
   2092 		return;
   2093 	}
   2094 
   2095 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
   2096 		/*
   2097 		 * Set the receive callback.
   2098 		 */
   2099 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
   2100 	}
   2101 
   2102 	/*
   2103 	 * Note that raw mode is enabled.
   2104 	 */
   2105 	dsp->ds_mode = DLD_RAW;
   2106 	mac_perim_exit(mph);
   2107 
   2108 	miocack(q, mp, 0, 0);
   2109 }
   2110 
   2111 /*
   2112  * DLIOCHDRINFO
   2113  */
   2114 static void
   2115 ioc_fast(dld_str_t *dsp, mblk_t *mp)
   2116 {
   2117 	dl_unitdata_req_t *dlp;
   2118 	off_t		off;
   2119 	size_t		len;
   2120 	const uint8_t	*addr;
   2121 	uint16_t	sap;
   2122 	mblk_t		*nmp;
   2123 	mblk_t		*hmp;
   2124 	uint_t		addr_length;
   2125 	queue_t		*q = dsp->ds_wq;
   2126 	int		err;
   2127 	mac_perim_handle_t	mph;
   2128 
   2129 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
   2130 		err = ENOTSUP;
   2131 		goto failed;
   2132 	}
   2133 
   2134 	/*
   2135 	 * DLIOCHDRINFO should only come from IP. The one initiated from
   2136 	 * user-land should not be allowed.
   2137 	 */
   2138 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
   2139 		err = EINVAL;
   2140 		goto failed;
   2141 	}
   2142 
   2143 	nmp = mp->b_cont;
   2144 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
   2145 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
   2146 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
   2147 		err = EINVAL;
   2148 		goto failed;
   2149 	}
   2150 
   2151 	off = dlp->dl_dest_addr_offset;
   2152 	len = dlp->dl_dest_addr_length;
   2153 
   2154 	if (!MBLKIN(nmp, off, len)) {
   2155 		err = EINVAL;
   2156 		goto failed;
   2157 	}
   2158 
   2159 	if (dsp->ds_dlstate != DL_IDLE) {
   2160 		err = ENOTSUP;
   2161 		goto failed;
   2162 	}
   2163 
   2164 	addr_length = dsp->ds_mip->mi_addr_length;
   2165 	if (len != addr_length + sizeof (uint16_t)) {
   2166 		err = EINVAL;
   2167 		goto failed;
   2168 	}
   2169 
   2170 	addr = nmp->b_rptr + off;
   2171 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
   2172 
   2173 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
   2174 		err = ENOMEM;
   2175 		goto failed;
   2176 	}
   2177 
   2178 	/*
   2179 	 * This ioctl might happen concurrently with a direct call to dld_capab
   2180 	 * that tries to enable direct and/or poll capabilities. Since the
   2181 	 * stack does not serialize them, we do so here to avoid mixing
   2182 	 * the callbacks.
   2183 	 */
   2184 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
   2185 	if (dsp->ds_mode != DLD_FASTPATH) {
   2186 		/*
   2187 		 * Set the receive callback (unless polling is enabled).
   2188 		 */
   2189 		if (!dsp->ds_polling && !dsp->ds_direct)
   2190 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
   2191 
   2192 		/*
   2193 		 * Note that fast-path mode is enabled.
   2194 		 */
   2195 		dsp->ds_mode = DLD_FASTPATH;
   2196 	}
   2197 	mac_perim_exit(mph);
   2198 
   2199 	freemsg(nmp->b_cont);
   2200 	nmp->b_cont = hmp;
   2201 
   2202 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
   2203 	return;
   2204 failed:
   2205 	miocnak(q, mp, 0, err);
   2206 }
   2207 
   2208 /*
   2209  * DLIOCLOWLINK: request actual link state changes. When the
   2210  * link is part of a bridge instance the client receives actual
   2211  * link state changes and not the aggregate link status. Used by
   2212  * the bridging daemon (bridged) for proper RSTP operation.
   2213  */
   2214 static void
   2215 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
   2216 {
   2217 	queue_t *q = dsp->ds_wq;
   2218 	int err;
   2219 
   2220 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
   2221 		miocnak(q, mp, 0, err);
   2222 	} else {
   2223 		/* LINTED: alignment */
   2224 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
   2225 		miocack(q, mp, 0, 0);
   2226 	}
   2227 }
   2228 
   2229 /*
   2230  * Catch-all handler.
   2231  */
   2232 static void
   2233 ioc(dld_str_t *dsp, mblk_t *mp)
   2234 {
   2235 	queue_t	*q = dsp->ds_wq;
   2236 
   2237 	if (dsp->ds_dlstate == DL_UNATTACHED) {
   2238 		miocnak(q, mp, 0, EINVAL);
   2239 		return;
   2240 	}
   2241 	mac_ioctl(dsp->ds_mh, q, mp);
   2242 }
   2243