Home | History | Annotate | Download | only in mac
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * MAC Services Module - misc utilities
     28  */
     29 
     30 #include <sys/types.h>
     31 #include <sys/mac.h>
     32 #include <sys/mac_impl.h>
     33 #include <sys/mac_client_priv.h>
     34 #include <sys/mac_client_impl.h>
     35 #include <sys/mac_soft_ring.h>
     36 #include <sys/strsubr.h>
     37 #include <sys/strsun.h>
     38 #include <sys/vlan.h>
     39 #include <sys/pattr.h>
     40 #include <sys/pci_tools.h>
     41 #include <inet/ip.h>
     42 #include <inet/ip_impl.h>
     43 #include <inet/ip6.h>
     44 #include <sys/vtrace.h>
     45 #include <sys/dlpi.h>
     46 #include <sys/sunndi.h>
     47 #include <inet/ipsec_impl.h>
     48 #include <inet/sadb.h>
     49 #include <inet/ipsecesp.h>
     50 #include <inet/ipsecah.h>
     51 
     52 /*
     53  * Copy an mblk, preserving its hardware checksum flags.
     54  */
     55 static mblk_t *
     56 mac_copymsg_cksum(mblk_t *mp)
     57 {
     58 	mblk_t *mp1;
     59 	uint32_t start, stuff, end, value, flags;
     60 
     61 	mp1 = copymsg(mp);
     62 	if (mp1 == NULL)
     63 		return (NULL);
     64 
     65 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
     66 	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
     67 	    flags, KM_NOSLEEP);
     68 
     69 	return (mp1);
     70 }
     71 
     72 /*
     73  * Copy an mblk chain, presenting the hardware checksum flags of the
     74  * individual mblks.
     75  */
     76 mblk_t *
     77 mac_copymsgchain_cksum(mblk_t *mp)
     78 {
     79 	mblk_t *nmp = NULL;
     80 	mblk_t **nmpp = &nmp;
     81 
     82 	for (; mp != NULL; mp = mp->b_next) {
     83 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
     84 			freemsgchain(nmp);
     85 			return (NULL);
     86 		}
     87 
     88 		nmpp = &((*nmpp)->b_next);
     89 	}
     90 
     91 	return (nmp);
     92 }
     93 
     94 /*
     95  * Process the specified mblk chain for proper handling of hardware
     96  * checksum offload. This routine is invoked for loopback traffic
     97  * between MAC clients.
     98  * The function handles a NULL mblk chain passed as argument.
     99  */
    100 mblk_t *
    101 mac_fix_cksum(mblk_t *mp_chain)
    102 {
    103 	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
    104 	uint32_t flags, start, stuff, end, value;
    105 
    106 	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
    107 		uint16_t len;
    108 		uint32_t offset;
    109 		struct ether_header *ehp;
    110 		uint16_t sap;
    111 
    112 		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
    113 		    &flags);
    114 		if (flags == 0)
    115 			continue;
    116 
    117 		/*
    118 		 * Since the processing of checksum offload for loopback
    119 		 * traffic requires modification of the packet contents,
    120 		 * ensure sure that we are always modifying our own copy.
    121 		 */
    122 		if (DB_REF(mp) > 1) {
    123 			mp1 = copymsg(mp);
    124 			if (mp1 == NULL)
    125 				continue;
    126 			mp1->b_next = mp->b_next;
    127 			mp->b_next = NULL;
    128 			freemsg(mp);
    129 			if (prev != NULL)
    130 				prev->b_next = mp1;
    131 			else
    132 				new_chain = mp1;
    133 			mp = mp1;
    134 		}
    135 
    136 		/*
    137 		 * Ethernet, and optionally VLAN header.
    138 		 */
    139 		/* LINTED: improper alignment cast */
    140 		ehp = (struct ether_header *)mp->b_rptr;
    141 		if (ntohs(ehp->ether_type) == VLAN_TPID) {
    142 			struct ether_vlan_header *evhp;
    143 
    144 			ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
    145 			/* LINTED: improper alignment cast */
    146 			evhp = (struct ether_vlan_header *)mp->b_rptr;
    147 			sap = ntohs(evhp->ether_type);
    148 			offset = sizeof (struct ether_vlan_header);
    149 		} else {
    150 			sap = ntohs(ehp->ether_type);
    151 			offset = sizeof (struct ether_header);
    152 		}
    153 
    154 		if (MBLKL(mp) <= offset) {
    155 			offset -= MBLKL(mp);
    156 			if (mp->b_cont == NULL) {
    157 				/* corrupted packet, skip it */
    158 				if (prev != NULL)
    159 					prev->b_next = mp->b_next;
    160 				else
    161 					new_chain = mp->b_next;
    162 				mp1 = mp->b_next;
    163 				mp->b_next = NULL;
    164 				freemsg(mp);
    165 				mp = mp1;
    166 				continue;
    167 			}
    168 			mp = mp->b_cont;
    169 		}
    170 
    171 		if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
    172 			ipha_t *ipha = NULL;
    173 
    174 			/*
    175 			 * In order to compute the full and header
    176 			 * checksums, we need to find and parse
    177 			 * the IP and/or ULP headers.
    178 			 */
    179 
    180 			sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
    181 
    182 			/*
    183 			 * IP header.
    184 			 */
    185 			if (sap != ETHERTYPE_IP)
    186 				continue;
    187 
    188 			ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
    189 			/* LINTED: improper alignment cast */
    190 			ipha = (ipha_t *)(mp->b_rptr + offset);
    191 
    192 			if (flags & HCK_FULLCKSUM) {
    193 				ipaddr_t src, dst;
    194 				uint32_t cksum;
    195 				uint16_t *up;
    196 				uint8_t proto;
    197 
    198 				/*
    199 				 * Pointer to checksum field in ULP header.
    200 				 */
    201 				proto = ipha->ipha_protocol;
    202 				ASSERT(ipha->ipha_version_and_hdr_length ==
    203 				    IP_SIMPLE_HDR_VERSION);
    204 
    205 				switch (proto) {
    206 				case IPPROTO_TCP:
    207 					/* LINTED: improper alignment cast */
    208 					up = IPH_TCPH_CHECKSUMP(ipha,
    209 					    IP_SIMPLE_HDR_LENGTH);
    210 					break;
    211 
    212 				case IPPROTO_UDP:
    213 					/* LINTED: improper alignment cast */
    214 					up = IPH_UDPH_CHECKSUMP(ipha,
    215 					    IP_SIMPLE_HDR_LENGTH);
    216 					break;
    217 
    218 				default:
    219 					cmn_err(CE_WARN, "mac_fix_cksum: "
    220 					    "unexpected protocol: %d", proto);
    221 					continue;
    222 				}
    223 
    224 				/*
    225 				 * Pseudo-header checksum.
    226 				 */
    227 				src = ipha->ipha_src;
    228 				dst = ipha->ipha_dst;
    229 				len = ntohs(ipha->ipha_length) -
    230 				    IP_SIMPLE_HDR_LENGTH;
    231 
    232 				cksum = (dst >> 16) + (dst & 0xFFFF) +
    233 				    (src >> 16) + (src & 0xFFFF);
    234 				cksum += htons(len);
    235 
    236 				/*
    237 				 * The checksum value stored in the packet needs
    238 				 * to be correct. Compute it here.
    239 				 */
    240 				*up = 0;
    241 				cksum += (((proto) == IPPROTO_UDP) ?
    242 				    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
    243 				cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
    244 				    offset, cksum);
    245 				*(up) = (uint16_t)(cksum ? cksum : ~cksum);
    246 
    247 				flags |= HCK_FULLCKSUM_OK;
    248 				value = 0xffff;
    249 			}
    250 
    251 			if (flags & HCK_IPV4_HDRCKSUM) {
    252 				ASSERT(ipha != NULL);
    253 				ipha->ipha_hdr_checksum =
    254 				    (uint16_t)ip_csum_hdr(ipha);
    255 			}
    256 		}
    257 
    258 		if (flags & HCK_PARTIALCKSUM) {
    259 			uint16_t *up, partial, cksum;
    260 			uchar_t *ipp; /* ptr to beginning of IP header */
    261 
    262 			if (mp->b_cont != NULL) {
    263 				mblk_t *mp1;
    264 
    265 				mp1 = msgpullup(mp, offset + end);
    266 				if (mp1 == NULL)
    267 					continue;
    268 				mp1->b_next = mp->b_next;
    269 				mp->b_next = NULL;
    270 				freemsg(mp);
    271 				if (prev != NULL)
    272 					prev->b_next = mp1;
    273 				else
    274 					new_chain = mp1;
    275 				mp = mp1;
    276 			}
    277 
    278 			ipp = mp->b_rptr + offset;
    279 			/* LINTED: cast may result in improper alignment */
    280 			up = (uint16_t *)((uchar_t *)ipp + stuff);
    281 			partial = *up;
    282 			*up = 0;
    283 
    284 			cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
    285 			    end - start, partial);
    286 			cksum = ~cksum;
    287 			*up = cksum ? cksum : ~cksum;
    288 
    289 			/*
    290 			 * Since we already computed the whole checksum,
    291 			 * indicate to the stack that it has already
    292 			 * been verified by the hardware.
    293 			 */
    294 			flags &= ~HCK_PARTIALCKSUM;
    295 			flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
    296 			value = 0xffff;
    297 		}
    298 
    299 		(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
    300 		    value, flags, KM_NOSLEEP);
    301 	}
    302 
    303 	return (new_chain);
    304 }
    305 
    306 /*
    307  * Add VLAN tag to the specified mblk.
    308  */
    309 mblk_t *
    310 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
    311 {
    312 	mblk_t *hmp;
    313 	struct ether_vlan_header *evhp;
    314 	struct ether_header *ehp;
    315 	uint32_t start, stuff, end, value, flags;
    316 
    317 	ASSERT(pri != 0 || vid != 0);
    318 
    319 	/*
    320 	 * Allocate an mblk for the new tagged ethernet header,
    321 	 * and copy the MAC addresses and ethertype from the
    322 	 * original header.
    323 	 */
    324 
    325 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
    326 	if (hmp == NULL) {
    327 		freemsg(mp);
    328 		return (NULL);
    329 	}
    330 
    331 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
    332 	ehp = (struct ether_header *)mp->b_rptr;
    333 
    334 	bcopy(ehp, evhp, (ETHERADDRL * 2));
    335 	evhp->ether_type = ehp->ether_type;
    336 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
    337 
    338 	hmp->b_wptr += sizeof (struct ether_vlan_header);
    339 	mp->b_rptr += sizeof (struct ether_header);
    340 
    341 	/*
    342 	 * Free the original message if it's now empty. Link the
    343 	 * rest of messages to the header message.
    344 	 */
    345 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
    346 	(void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
    347 	    KM_NOSLEEP);
    348 	if (MBLKL(mp) == 0) {
    349 		hmp->b_cont = mp->b_cont;
    350 		freeb(mp);
    351 	} else {
    352 		hmp->b_cont = mp;
    353 	}
    354 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
    355 
    356 	/*
    357 	 * Initialize the new TCI (Tag Control Information).
    358 	 */
    359 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
    360 
    361 	return (hmp);
    362 }
    363 
    364 /*
    365  * Adds a VLAN tag with the specified VID and priority to each mblk of
    366  * the specified chain.
    367  */
    368 mblk_t *
    369 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
    370 {
    371 	mblk_t *next_mp, **prev, *mp;
    372 
    373 	mp = mp_chain;
    374 	prev = &mp_chain;
    375 
    376 	while (mp != NULL) {
    377 		next_mp = mp->b_next;
    378 		mp->b_next = NULL;
    379 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
    380 			freemsgchain(next_mp);
    381 			break;
    382 		}
    383 		*prev = mp;
    384 		prev = &mp->b_next;
    385 		mp = mp->b_next = next_mp;
    386 	}
    387 
    388 	return (mp_chain);
    389 }
    390 
    391 /*
    392  * Strip VLAN tag
    393  */
    394 mblk_t *
    395 mac_strip_vlan_tag(mblk_t *mp)
    396 {
    397 	mblk_t *newmp;
    398 	struct ether_vlan_header *evhp;
    399 
    400 	evhp = (struct ether_vlan_header *)mp->b_rptr;
    401 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
    402 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
    403 
    404 		if (DB_REF(mp) > 1) {
    405 			newmp = copymsg(mp);
    406 			if (newmp == NULL)
    407 				return (NULL);
    408 			freemsg(mp);
    409 			mp = newmp;
    410 		}
    411 
    412 		evhp = (struct ether_vlan_header *)mp->b_rptr;
    413 
    414 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
    415 		mp->b_rptr += VLAN_TAGSZ;
    416 	}
    417 	return (mp);
    418 }
    419 
    420 /*
    421  * Strip VLAN tag from each mblk of the chain.
    422  */
    423 mblk_t *
    424 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
    425 {
    426 	mblk_t *mp, *next_mp, **prev;
    427 
    428 	mp = mp_chain;
    429 	prev = &mp_chain;
    430 
    431 	while (mp != NULL) {
    432 		next_mp = mp->b_next;
    433 		mp->b_next = NULL;
    434 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
    435 			freemsgchain(next_mp);
    436 			break;
    437 		}
    438 		*prev = mp;
    439 		prev = &mp->b_next;
    440 		mp = mp->b_next = next_mp;
    441 	}
    442 
    443 	return (mp_chain);
    444 }
    445 
    446 /*
    447  * Default callback function. Used when the datapath is not yet initialized.
    448  */
    449 /* ARGSUSED */
    450 void
    451 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
    452     boolean_t loopback)
    453 {
    454 	mblk_t	*mp1 = mp;
    455 
    456 	while (mp1 != NULL) {
    457 		mp1->b_prev = NULL;
    458 		mp1->b_queue = NULL;
    459 		mp1 = mp1->b_next;
    460 	}
    461 	freemsgchain(mp);
    462 }
    463 
    464 /*
    465  * Determines the IPv6 header length accounting for all the optional IPv6
    466  * headers (hop-by-hop, destination, routing and fragment). The header length
    467  * and next header value (a transport header) is captured.
    468  *
    469  * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
    470  * returns B_TRUE.
    471  */
    472 boolean_t
    473 mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length,
    474     uint8_t *next_hdr, boolean_t *ip_fragmented, uint32_t *ip_frag_ident)
    475 {
    476 	uint16_t length;
    477 	uint_t	ehdrlen;
    478 	uint8_t *whereptr;
    479 	uint8_t *endptr;
    480 	uint8_t *nexthdrp;
    481 	ip6_dest_t *desthdr;
    482 	ip6_rthdr_t *rthdr;
    483 	ip6_frag_t *fraghdr;
    484 
    485 	endptr = mp->b_wptr;
    486 	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
    487 		return (B_FALSE);
    488 	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
    489 	length = IPV6_HDR_LEN;
    490 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
    491 
    492 	if (ip_fragmented != NULL)
    493 		*ip_fragmented = B_FALSE;
    494 
    495 	nexthdrp = &ip6h->ip6_nxt;
    496 	while (whereptr < endptr) {
    497 		/* Is there enough left for len + nexthdr? */
    498 		if (whereptr + MIN_EHDR_LEN > endptr)
    499 			break;
    500 
    501 		switch (*nexthdrp) {
    502 		case IPPROTO_HOPOPTS:
    503 		case IPPROTO_DSTOPTS:
    504 			/* Assumes the headers are identical for hbh and dst */
    505 			desthdr = (ip6_dest_t *)whereptr;
    506 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
    507 			if ((uchar_t *)desthdr +  ehdrlen > endptr)
    508 				return (B_FALSE);
    509 			nexthdrp = &desthdr->ip6d_nxt;
    510 			break;
    511 		case IPPROTO_ROUTING:
    512 			rthdr = (ip6_rthdr_t *)whereptr;
    513 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
    514 			if ((uchar_t *)rthdr +  ehdrlen > endptr)
    515 				return (B_FALSE);
    516 			nexthdrp = &rthdr->ip6r_nxt;
    517 			break;
    518 		case IPPROTO_FRAGMENT:
    519 			fraghdr = (ip6_frag_t *)whereptr;
    520 			ehdrlen = sizeof (ip6_frag_t);
    521 			if ((uchar_t *)&fraghdr[1] > endptr)
    522 				return (B_FALSE);
    523 			nexthdrp = &fraghdr->ip6f_nxt;
    524 			if (ip_fragmented != NULL)
    525 				*ip_fragmented = B_TRUE;
    526 			if (ip_frag_ident != NULL)
    527 				*ip_frag_ident = fraghdr->ip6f_ident;
    528 			break;
    529 		case IPPROTO_NONE:
    530 			/* No next header means we're finished */
    531 		default:
    532 			*hdr_length = length;
    533 			*next_hdr = *nexthdrp;
    534 			return (B_TRUE);
    535 		}
    536 		length += ehdrlen;
    537 		whereptr += ehdrlen;
    538 		*hdr_length = length;
    539 		*next_hdr = *nexthdrp;
    540 	}
    541 	switch (*nexthdrp) {
    542 	case IPPROTO_HOPOPTS:
    543 	case IPPROTO_DSTOPTS:
    544 	case IPPROTO_ROUTING:
    545 	case IPPROTO_FRAGMENT:
    546 		/*
    547 		 * If any know extension headers are still to be processed,
    548 		 * the packet's malformed (or at least all the IP header(s) are
    549 		 * not in the same mblk - and that should never happen.
    550 		 */
    551 		return (B_FALSE);
    552 
    553 	default:
    554 		/*
    555 		 * If we get here, we know that all of the IP headers were in
    556 		 * the same mblk, even if the ULP header is in the next mblk.
    557 		 */
    558 		*hdr_length = length;
    559 		*next_hdr = *nexthdrp;
    560 		return (B_TRUE);
    561 	}
    562 }
    563 
/*
 * Working state used while looking up and retargeting the interrupt of
 * a device through the interrupt control node of its nexus.
 */
typedef struct mac_dladm_intr {
	int	ino;		/* ino of the matching interrupt entry */
	int	cpu_id;		/* cpu_id reported for that interrupt */
	/* minor node path of the device; compared against "path:drvN" */
	char	driver_path[MAXPATHLEN];
	/* "/devices<nexus path>:intr", opened to issue the pcitool ioctls */
	char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
    570 
    571 /* Bind the interrupt to cpu_num */
    572 static int
    573 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int ino)
    574 {
    575 	pcitool_intr_set_t	iset;
    576 	int			err;
    577 
    578 	iset.ino = ino;
    579 	iset.cpu_id = cpu_num;
    580 	iset.user_version = PCITOOL_VERSION;
    581 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
    582 	    kcred, NULL);
    583 
    584 	return (err);
    585 }
    586 
    587 /*
    588  * Search interrupt information. iget is filled in with the info to search
    589  */
    590 static boolean_t
    591 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
    592 {
    593 	int	i;
    594 	char	driver_path[2 * MAXPATHLEN];
    595 
    596 	for (i = 0; i < iget_p->num_devs; i++) {
    597 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
    598 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
    599 		    ":%s%d", iget_p->dev[i].driver_name,
    600 		    iget_p->dev[i].dev_inst);
    601 		/* Match the device path for the device path */
    602 		if (strcmp(driver_path, dln->driver_path) == 0) {
    603 			dln->ino = iget_p->ino;
    604 			dln->cpu_id = iget_p->cpu_id;
    605 			return (B_TRUE);
    606 		}
    607 	}
    608 	return (B_FALSE);
    609 }
    610 
    611 /*
    612  * Get information about ino, i.e. if this is the interrupt for our
    613  * device and where it is bound etc.
    614  */
    615 static boolean_t
    616 mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln)
    617 {
    618 	pcitool_intr_get_t	*iget_p;
    619 	int			ipsz;
    620 	int			nipsz;
    621 	int			err;
    622 	uint8_t			inum;
    623 
    624 	/*
    625 	 * Check if SLEEP is OK, i.e if could come here in response to
    626 	 * changing the fanout due to some callback from the driver, say
    627 	 * link speed changes.
    628 	 */
    629 	ipsz = PCITOOL_IGET_SIZE(0);
    630 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
    631 
    632 	iget_p->num_devs_ret = 0;
    633 	iget_p->user_version = PCITOOL_VERSION;
    634 	iget_p->ino = ino;
    635 
    636 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
    637 	    FKIOCTL, kcred, NULL);
    638 	if (err != 0) {
    639 		kmem_free(iget_p, ipsz);
    640 		return (B_FALSE);
    641 	}
    642 	if (iget_p->num_devs == 0) {
    643 		kmem_free(iget_p, ipsz);
    644 		return (B_FALSE);
    645 	}
    646 	inum = iget_p->num_devs;
    647 	if (iget_p->num_devs_ret < iget_p->num_devs) {
    648 		/* Reallocate */
    649 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
    650 
    651 		kmem_free(iget_p, ipsz);
    652 		ipsz = nipsz;
    653 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
    654 
    655 		iget_p->num_devs_ret = inum;
    656 		iget_p->ino = ino;
    657 		iget_p->user_version = PCITOOL_VERSION;
    658 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
    659 		    FKIOCTL, kcred, NULL);
    660 		if (err != 0) {
    661 			kmem_free(iget_p, ipsz);
    662 			return (B_FALSE);
    663 		}
    664 		/* defensive */
    665 		if (iget_p->num_devs != iget_p->num_devs_ret) {
    666 			kmem_free(iget_p, ipsz);
    667 			return (B_FALSE);
    668 		}
    669 	}
    670 
    671 	if (mac_search_intrinfo(iget_p, dln)) {
    672 		kmem_free(iget_p, ipsz);
    673 		return (B_TRUE);
    674 	}
    675 	kmem_free(iget_p, ipsz);
    676 	return (B_FALSE);
    677 }
    678 
    679 /*
    680  * Get the interrupts and check each one to see if it is for our device.
    681  */
    682 static int
    683 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
    684 {
    685 	pcitool_intr_info_t	intr_info;
    686 	int			err;
    687 	int			ino;
    688 
    689 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
    690 	    FKIOCTL, kcred, NULL);
    691 	if (err != 0)
    692 		return (-1);
    693 
    694 	for (ino = 0; ino < intr_info.num_intr; ino++) {
    695 		if (mac_get_single_intr(lh, ino, dln)) {
    696 			if (dln->cpu_id == cpuid)
    697 				return (0);
    698 			return (1);
    699 		}
    700 	}
    701 	return (-1);
    702 }
    703 
    704 /*
    705  * Obtain the nexus parent node info. for mdip.
    706  */
    707 static dev_info_t *
    708 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
    709 {
    710 	struct dev_info		*tdip = (struct dev_info *)mdip;
    711 	struct ddi_minor_data	*minordata;
    712 	int			circ;
    713 	dev_info_t		*pdip;
    714 	char			pathname[MAXPATHLEN];
    715 
    716 	while (tdip != NULL) {
    717 		/*
    718 		 * The netboot code could call this function while walking the
    719 		 * device tree so we need to use ndi_devi_tryenter() here to
    720 		 * avoid deadlock.
    721 		 */
    722 		if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
    723 			break;
    724 
    725 		for (minordata = tdip->devi_minor; minordata != NULL;
    726 		    minordata = minordata->next) {
    727 			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
    728 			    strlen(DDI_NT_INTRCTL)) == 0) {
    729 				pdip = minordata->dip;
    730 				(void) ddi_pathname(pdip, pathname);
    731 				(void) snprintf(dln->nexus_path, MAXPATHLEN,
    732 				    "/devices%s:intr", pathname);
    733 				(void) ddi_pathname_minor(minordata, pathname);
    734 				ndi_devi_exit((dev_info_t *)tdip, circ);
    735 				return (pdip);
    736 			}
    737 		}
    738 		ndi_devi_exit((dev_info_t *)tdip, circ);
    739 		tdip = tdip->devi_parent;
    740 	}
    741 	return (NULL);
    742 }
    743 
    744 /*
    745  * For a primary MAC client, if the user has set a list or CPUs or
    746  * we have obtained it implicitly, we try to retarget the interrupt
    747  * for that device on one of the CPUs in the list.
    748  * We assign the interrupt to the same CPU as the poll thread.
    749  */
    750 static boolean_t
    751 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
    752 {
    753 	ldi_handle_t		lh = NULL;
    754 	ldi_ident_t		li = NULL;
    755 	int			err;
    756 	int			ret;
    757 	mac_dladm_intr_t	dln;
    758 	dev_info_t		*dip;
    759 	struct ddi_minor_data	*minordata;
    760 
    761 	dln.nexus_path[0] = '\0';
    762 	dln.driver_path[0] = '\0';
    763 
    764 	minordata = ((struct dev_info *)mdip)->devi_minor;
    765 	while (minordata != NULL) {
    766 		if (minordata->type == DDM_MINOR)
    767 			break;
    768 		minordata = minordata->next;
    769 	}
    770 	if (minordata == NULL)
    771 		return (B_FALSE);
    772 
    773 	(void) ddi_pathname_minor(minordata, dln.driver_path);
    774 
    775 	dip = mac_get_nexus_node(mdip, &dln);
    776 	/* defensive */
    777 	if (dip == NULL)
    778 		return (B_FALSE);
    779 
    780 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
    781 	if (err != 0)
    782 		return (B_FALSE);
    783 
    784 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
    785 	if (err != 0)
    786 		return (B_FALSE);
    787 
    788 	ret = mac_validate_intr(lh, &dln, cpuid);
    789 	if (ret < 0) {
    790 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
    791 		return (B_FALSE);
    792 	}
    793 	/* cmn_note? */
    794 	if (ret != 0)
    795 		if ((err = (mac_set_intr(lh, cpuid, dln.ino))) != 0) {
    796 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
    797 			return (B_FALSE);
    798 		}
    799 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
    800 	return (B_TRUE);
    801 }
    802 
    803 void
    804 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
    805 {
    806 	dev_info_t		*mdip = (dev_info_t *)arg;
    807 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
    808 	mac_resource_props_t	*mrp;
    809 	mac_perim_handle_t	mph;
    810 
    811 	if (cpuid == -1 || !mac_check_interrupt_binding(mdip, cpuid))
    812 		return;
    813 
    814 	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
    815 	mrp = MCIP_RESOURCE_PROPS(mcip);
    816 	mrp->mrp_intr_cpu = cpuid;
    817 	mac_perim_exit(mph);
    818 }
    819 
    820 int32_t
    821 mac_client_intr_cpu(mac_client_handle_t mch)
    822 {
    823 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
    824 	mac_cpus_t		*srs_cpu;
    825 	mac_soft_ring_set_t	*rx_srs;
    826 	flow_entry_t		*flent = mcip->mci_flent;
    827 	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
    828 
    829 	/*
    830 	 * Check if we need to retarget the interrupt. We do this only
    831 	 * for the primary MAC client. We do this if we have the only
    832 	 *  exclusive ring in the group.
    833 	 */
    834 	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
    835 		rx_srs = flent->fe_rx_srs[1];
    836 		srs_cpu = &rx_srs->srs_cpu;
    837 		if (mrp->mrp_intr_cpu == srs_cpu->mc_pollid)
    838 			return (-1);
    839 		return (srs_cpu->mc_pollid);
    840 	}
    841 	return (-1);
    842 }
    843 
    844 void *
    845 mac_get_devinfo(mac_handle_t mh)
    846 {
    847 	mac_impl_t	*mip = (mac_impl_t *)mh;
    848 
    849 	return ((void *)mip->mi_dip);
    850 }
    851 
/*
 * Hash helpers: XOR together the first 2, 4 or 6 bytes found at x.
 * x must be indexable byte-wise (e.g. a uint8_t pointer) and is
 * evaluated once per byte.
 */
#define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
#define	PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
#define	PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
    855 
    856 uint64_t
    857 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
    858 {
    859 	struct ether_header *ehp;
    860 	uint64_t hash = 0;
    861 	uint16_t sap;
    862 	uint_t skip_len;
    863 	uint8_t proto;
    864 	boolean_t ip_fragmented;
    865 
    866 	/*
    867 	 * We may want to have one of these per MAC type plugin in the
    868 	 * future. For now supports only ethernet.
    869 	 */
    870 	if (media != DL_ETHER)
    871 		return (0L);
    872 
    873 	/* for now we support only outbound packets */
    874 	ASSERT(is_outbound);
    875 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
    876 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
    877 
    878 	/* compute L2 hash */
    879 
    880 	ehp = (struct ether_header *)mp->b_rptr;
    881 
    882 	if ((policy & MAC_PKT_HASH_L2) != 0) {
    883 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
    884 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
    885 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
    886 		policy &= ~MAC_PKT_HASH_L2;
    887 	}
    888 
    889 	if (policy == 0)
    890 		goto done;
    891 
    892 	/* skip ethernet header */
    893 
    894 	sap = ntohs(ehp->ether_type);
    895 	if (sap == ETHERTYPE_VLAN) {
    896 		struct ether_vlan_header *evhp;
    897 		mblk_t *newmp = NULL;
    898 
    899 		skip_len = sizeof (struct ether_vlan_header);
    900 		if (MBLKL(mp) < skip_len) {
    901 			/* the vlan tag is the payload, pull up first */
    902 			newmp = msgpullup(mp, -1);
    903 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
    904 				goto done;
    905 			}
    906 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
    907 		} else {
    908 			evhp = (struct ether_vlan_header *)mp->b_rptr;
    909 		}
    910 
    911 		sap = ntohs(evhp->ether_type);
    912 		freemsg(newmp);
    913 	} else {
    914 		skip_len = sizeof (struct ether_header);
    915 	}
    916 
    917 	/* if ethernet header is in its own mblk, skip it */
    918 	if (MBLKL(mp) <= skip_len) {
    919 		skip_len -= MBLKL(mp);
    920 		mp = mp->b_cont;
    921 		if (mp == NULL)
    922 			goto done;
    923 	}
    924 
    925 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
    926 
    927 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
    928 
    929 	switch (sap) {
    930 	case ETHERTYPE_IP: {
    931 		ipha_t *iphp;
    932 
    933 		/*
    934 		 * If the header is not aligned or the header doesn't fit
    935 		 * in the mblk, bail now. Note that this may cause packets
    936 		 * reordering.
    937 		 */
    938 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
    939 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
    940 		    !OK_32PTR((char *)iphp))
    941 			goto done;
    942 
    943 		proto = iphp->ipha_protocol;
    944 		skip_len += IPH_HDR_LENGTH(iphp);
    945 
    946 		/* Check if the packet is fragmented. */
    947 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
    948 		    IPH_OFFSET;
    949 
    950 		/*
    951 		 * For fragmented packets, use addresses in addition to
    952 		 * the frag_id to generate the hash inorder to get
    953 		 * better distribution.
    954 		 */
    955 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
    956 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
    957 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
    958 
    959 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
    960 			    PKT_HASH_4BYTES(ip_dst));
    961 			policy &= ~MAC_PKT_HASH_L3;
    962 		}
    963 
    964 		if (ip_fragmented) {
    965 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
    966 			hash ^= PKT_HASH_2BYTES(identp);
    967 			goto done;
    968 		}
    969 		break;
    970 	}
    971 	case ETHERTYPE_IPV6: {
    972 		ip6_t *ip6hp;
    973 		uint16_t hdr_length;
    974 		uint32_t ip_frag_ident;
    975 
    976 		/*
    977 		 * If the header is not aligned or the header doesn't fit
    978 		 * in the mblk, bail now. Note that this may cause packets
    979 		 * reordering.
    980 		 */
    981 
    982 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
    983 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
    984 		    !OK_32PTR((char *)ip6hp))
    985 			goto done;
    986 
    987 		if (!mac_ip_hdr_length_v6(mp, ip6hp, &hdr_length, &proto,
    988 		    &ip_fragmented, &ip_frag_ident))
    989 			goto done;
    990 		skip_len += hdr_length;
    991 
    992 		/*
    993 		 * For fragmented packets, use addresses in addition to
    994 		 * the frag_id to generate the hash inorder to get
    995 		 * better distribution.
    996 		 */
    997 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
    998 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
    999 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
   1000 
   1001 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
   1002 			    PKT_HASH_4BYTES(ip_dst));
   1003 			policy &= ~MAC_PKT_HASH_L3;
   1004 		}
   1005 
   1006 		if (ip_fragmented) {
   1007 			uint8_t *identp = (uint8_t *)&ip_frag_ident;
   1008 			hash ^= PKT_HASH_4BYTES(identp);
   1009 			goto done;
   1010 		}
   1011 		break;
   1012 	}
   1013 	default:
   1014 		goto done;
   1015 	}
   1016 
   1017 	if (policy == 0)
   1018 		goto done;
   1019 
   1020 	/* if ip header is in its own mblk, skip it */
   1021 	if (MBLKL(mp) <= skip_len) {
   1022 		skip_len -= MBLKL(mp);
   1023 		mp = mp->b_cont;
   1024 		if (mp == NULL)
   1025 			goto done;
   1026 	}
   1027 
   1028 	/* parse ULP header */
   1029 again:
   1030 	switch (proto) {
   1031 	case IPPROTO_TCP:
   1032 	case IPPROTO_UDP:
   1033 	case IPPROTO_ESP:
   1034 	case IPPROTO_SCTP:
   1035 		/*
   1036 		 * These Internet Protocols are intentionally designed
   1037 		 * for hashing from the git-go.  Port numbers are in the first
   1038 		 * word for transports, SPI is first for ESP.
   1039 		 */
   1040 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
   1041 			goto done;
   1042 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
   1043 		break;
   1044 
   1045 	case IPPROTO_AH: {
   1046 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
   1047 		uint_t ah_length = AH_TOTAL_LEN(ah);
   1048 
   1049 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
   1050 			goto done;
   1051 
   1052 		proto = ah->ah_nexthdr;
   1053 		skip_len += ah_length;
   1054 
   1055 		/* if AH header is in its own mblk, skip it */
   1056 		if (MBLKL(mp) <= skip_len) {
   1057 			skip_len -= MBLKL(mp);
   1058 			mp = mp->b_cont;
   1059 			if (mp == NULL)
   1060 				goto done;
   1061 		}
   1062 
   1063 		goto again;
   1064 	}
   1065 	}
   1066 
   1067 done:
   1068 	return (hash);
   1069 }
   1070