Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 /*
     28  * Internet Group Management Protocol (IGMP) routines.
     29  * Multicast Listener Discovery Protocol (MLD) routines.
     30  *
     31  * Written by Steve Deering, Stanford, May 1988.
     32  * Modified by Rosen Sharma, Stanford, Aug 1994.
     33  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
     34  *
     35  * MULTICAST 3.5.1.1
     36  */
     37 
     38 #include <sys/types.h>
     39 #include <sys/stream.h>
     40 #include <sys/stropts.h>
     41 #include <sys/strlog.h>
     42 #include <sys/strsun.h>
     43 #include <sys/systm.h>
     44 #include <sys/ddi.h>
     45 #include <sys/sunddi.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/atomic.h>
     48 #include <sys/zone.h>
     49 #include <sys/callb.h>
     50 #include <sys/param.h>
     51 #include <sys/socket.h>
     52 #include <inet/ipclassifier.h>
     53 #include <net/if.h>
     54 #include <net/route.h>
     55 #include <netinet/in.h>
     56 #include <netinet/igmp_var.h>
     57 #include <netinet/ip6.h>
     58 #include <netinet/icmp6.h>
     59 #include <inet/ipsec_impl.h>
     60 
     61 #include <inet/common.h>
     62 #include <inet/mi.h>
     63 #include <inet/nd.h>
     64 #include <inet/ip.h>
     65 #include <inet/ip6.h>
     66 #include <inet/ip_multi.h>
     67 #include <inet/ip_listutils.h>
     68 
     69 #include <netinet/igmp.h>
     70 #include <inet/ip_ndp.h>
     71 #include <inet/ip_if.h>
     72 
/*
 * Prototypes for the static (file-scope) helpers used by the IGMP and MLD
 * routines below.  The *_query_in routines handle received queries, the
 * *_sendpkt / *_sendrpt routines transmit reports, and the mcast_* routines
 * manage membership records and retransmission state.
 */
static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
static void	igmpv3_sendrpt(ill_t *ill, mrec_t *reclist);
static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
		    slist_t *srclist, mrec_t *next);
static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
		    mcast_record_t rtype, slist_t *flist);
static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
     86 
     87 /*
     88  * Macros used to do timer len conversions.  Timer values are always
     89  * stored and passed to the timer functions as milliseconds; but the
     90  * default values and values from the wire may not be.
     91  *
     92  * And yes, it's obscure, but decisecond is easier to abbreviate than
     93  * "tenths of a second".
     94  */
     95 #define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
     96 #define	SEC_TO_MSEC(sec)	((sec) * 1000)
     97 
     98 /*
     99  * A running timer (scheduled thru timeout) can be cancelled if another
    100  * timer with a shorter timeout value is scheduled before it has timed
    101  * out.  When the shorter timer expires, the original timer is updated
    102  * to account for the time elapsed while the shorter timer ran; but this
    103  * does not take into account the amount of time already spent in timeout
    104  * state before being preempted by the shorter timer, that is the time
    105  * interval between time scheduled to time cancelled.  This can cause
    106  * delays in sending out multicast membership reports.  To resolve this
    107  * problem, wallclock time (absolute time) is used instead of deltas
    108  * (relative time) to track timers.
    109  *
    110  * The MACRO below gets the lbolt value, used for proper timer scheduling
    111  * and firing. Therefore multicast membership reports are sent on time.
     112  * The timer does not exactly fire at the time it was scheduled to fire,
    113  * there is a difference of a few milliseconds observed. An offset is used
    114  * to take care of the difference.
    115  */
    116 
/* Current lbolt tick count converted to milliseconds, truncated to uint_t. */
#define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
/*
 * Slack used to absorb the difference between scheduled and actual firing
 * time; presumably in milliseconds like the other timer values -- TODO
 * confirm at the use sites.
 */
#define	CURRENT_OFFSET	(999)
    119 
    120 /*
    121  * The first multicast join will trigger the igmp timers / mld timers
    122  * The unit for next is milliseconds.
    123  */
    124 void
    125 igmp_start_timers(unsigned next, ip_stack_t *ipst)
    126 {
    127 	int	time_left;
    128 	int	ret;
    129 	timeout_id_t id;
    130 
    131 	ASSERT(next != 0 && next != INFINITY);
    132 
    133 	mutex_enter(&ipst->ips_igmp_timer_lock);
    134 
    135 	if (ipst->ips_igmp_timer_setter_active) {
    136 		/*
    137 		 * Serialize timer setters, one at a time. If the
    138 		 * timer is currently being set by someone,
    139 		 * just record the next time when it has to be
    140 		 * invoked and return. The current setter will
    141 		 * take care.
    142 		 */
    143 		ipst->ips_igmp_time_to_next =
    144 		    MIN(ipst->ips_igmp_time_to_next, next);
    145 		mutex_exit(&ipst->ips_igmp_timer_lock);
    146 		return;
    147 	} else {
    148 		ipst->ips_igmp_timer_setter_active = B_TRUE;
    149 	}
    150 	if (ipst->ips_igmp_timeout_id == 0) {
    151 		/*
    152 		 * The timer is inactive. We need to start a timer
    153 		 */
    154 		ipst->ips_igmp_time_to_next = next;
    155 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
    156 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
    157 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
    158 		ipst->ips_igmp_timer_setter_active = B_FALSE;
    159 		mutex_exit(&ipst->ips_igmp_timer_lock);
    160 		return;
    161 	}
    162 
    163 	/*
    164 	 * The timer was scheduled sometime back for firing in
    165 	 * 'igmp_time_to_next' ms and is active. We need to
    166 	 * reschedule the timeout if the new 'next' will happen
    167 	 * earlier than the currently scheduled timeout
    168 	 */
    169 	time_left = ipst->ips_igmp_timer_scheduled_last +
    170 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
    171 	if (time_left < MSEC_TO_TICK(next)) {
    172 		ipst->ips_igmp_timer_setter_active = B_FALSE;
    173 		mutex_exit(&ipst->ips_igmp_timer_lock);
    174 		return;
    175 	}
    176 	id = ipst->ips_igmp_timeout_id;
    177 
    178 	mutex_exit(&ipst->ips_igmp_timer_lock);
    179 	ret = untimeout(id);
    180 	mutex_enter(&ipst->ips_igmp_timer_lock);
    181 	/*
    182 	 * The timeout was cancelled, or the timeout handler
    183 	 * completed, while we were blocked in the untimeout.
    184 	 * No other thread could have set the timer meanwhile
    185 	 * since we serialized all the timer setters. Thus
    186 	 * no timer is currently active nor executing nor will
    187 	 * any timer fire in the future. We start the timer now
    188 	 * if needed.
    189 	 */
    190 	if (ret == -1) {
    191 		ASSERT(ipst->ips_igmp_timeout_id == 0);
    192 	} else {
    193 		ASSERT(ipst->ips_igmp_timeout_id != 0);
    194 		ipst->ips_igmp_timeout_id = 0;
    195 	}
    196 	if (ipst->ips_igmp_time_to_next != 0) {
    197 		ipst->ips_igmp_time_to_next =
    198 		    MIN(ipst->ips_igmp_time_to_next, next);
    199 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
    200 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
    201 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
    202 	}
    203 	ipst->ips_igmp_timer_setter_active = B_FALSE;
    204 	mutex_exit(&ipst->ips_igmp_timer_lock);
    205 }
    206 
    207 /*
    208  * mld_start_timers:
    209  * The unit for next is milliseconds.
    210  */
    211 void
    212 mld_start_timers(unsigned next, ip_stack_t *ipst)
    213 {
    214 	int	time_left;
    215 	int	ret;
    216 	timeout_id_t id;
    217 
    218 	ASSERT(next != 0 && next != INFINITY);
    219 
    220 	mutex_enter(&ipst->ips_mld_timer_lock);
    221 	if (ipst->ips_mld_timer_setter_active) {
    222 		/*
    223 		 * Serialize timer setters, one at a time. If the
    224 		 * timer is currently being set by someone,
    225 		 * just record the next time when it has to be
    226 		 * invoked and return. The current setter will
    227 		 * take care.
    228 		 */
    229 		ipst->ips_mld_time_to_next =
    230 		    MIN(ipst->ips_mld_time_to_next, next);
    231 		mutex_exit(&ipst->ips_mld_timer_lock);
    232 		return;
    233 	} else {
    234 		ipst->ips_mld_timer_setter_active = B_TRUE;
    235 	}
    236 	if (ipst->ips_mld_timeout_id == 0) {
    237 		/*
    238 		 * The timer is inactive. We need to start a timer
    239 		 */
    240 		ipst->ips_mld_time_to_next = next;
    241 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
    242 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
    243 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
    244 		ipst->ips_mld_timer_setter_active = B_FALSE;
    245 		mutex_exit(&ipst->ips_mld_timer_lock);
    246 		return;
    247 	}
    248 
    249 	/*
    250 	 * The timer was scheduled sometime back for firing in
    251 	 * 'igmp_time_to_next' ms and is active. We need to
    252 	 * reschedule the timeout if the new 'next' will happen
    253 	 * earlier than the currently scheduled timeout
    254 	 */
    255 	time_left = ipst->ips_mld_timer_scheduled_last +
    256 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
    257 	if (time_left < MSEC_TO_TICK(next)) {
    258 		ipst->ips_mld_timer_setter_active = B_FALSE;
    259 		mutex_exit(&ipst->ips_mld_timer_lock);
    260 		return;
    261 	}
    262 	id = ipst->ips_mld_timeout_id;
    263 
    264 	mutex_exit(&ipst->ips_mld_timer_lock);
    265 	ret = untimeout(id);
    266 	mutex_enter(&ipst->ips_mld_timer_lock);
    267 	/*
    268 	 * The timeout was cancelled, or the timeout handler
    269 	 * completed, while we were blocked in the untimeout.
    270 	 * No other thread could have set the timer meanwhile
    271 	 * since we serialized all the timer setters. Thus
    272 	 * no timer is currently active nor executing nor will
    273 	 * any timer fire in the future. We start the timer now
    274 	 * if needed.
    275 	 */
    276 	if (ret == -1) {
    277 		ASSERT(ipst->ips_mld_timeout_id == 0);
    278 	} else {
    279 		ASSERT(ipst->ips_mld_timeout_id != 0);
    280 		ipst->ips_mld_timeout_id = 0;
    281 	}
    282 	if (ipst->ips_mld_time_to_next != 0) {
    283 		ipst->ips_mld_time_to_next =
    284 		    MIN(ipst->ips_mld_time_to_next, next);
    285 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
    286 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
    287 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
    288 	}
    289 	ipst->ips_mld_timer_setter_active = B_FALSE;
    290 	mutex_exit(&ipst->ips_mld_timer_lock);
    291 }
    292 
    293 /*
    294  * igmp_input:
    295  * Return NULL for a bad packet that is discarded here.
    296  * Return mp if the message is OK and should be handed to "raw" receivers.
    297  * Callers of igmp_input() may need to reinitialize variables that were copied
    298  * from the mblk as this calls pullupmsg().
    299  */
    300 mblk_t *
    301 igmp_input(mblk_t *mp, ip_recv_attr_t *ira)
    302 {
    303 	igmpa_t 	*igmpa;
    304 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
    305 	int		iphlen, igmplen, mblklen;
    306 	ilm_t 		*ilm;
    307 	uint32_t	src, dst;
    308 	uint32_t 	group;
    309 	in6_addr_t	v6group;
    310 	uint_t		next;
    311 	ipif_t 		*ipif;
    312 	ill_t		*ill = ira->ira_ill;
    313 	ip_stack_t	*ipst = ill->ill_ipst;
    314 
    315 	ASSERT(!ill->ill_isv6);
    316 	++ipst->ips_igmpstat.igps_rcv_total;
    317 
    318 	mblklen = MBLKL(mp);
    319 	iphlen = ira->ira_ip_hdr_length;
    320 	if (mblklen < 1 || mblklen < iphlen) {
    321 		++ipst->ips_igmpstat.igps_rcv_tooshort;
    322 		goto bad_pkt;
    323 	}
    324 	igmplen = ira->ira_pktlen - iphlen;
    325 	/*
    326 	 * Since msg sizes are more variable with v3, just pullup the
    327 	 * whole thing now.
    328 	 */
    329 	if (MBLKL(mp) < (igmplen + iphlen)) {
    330 		mblk_t *mp1;
    331 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
    332 			++ipst->ips_igmpstat.igps_rcv_tooshort;
    333 			goto bad_pkt;
    334 		}
    335 		freemsg(mp);
    336 		mp = mp1;
    337 		ipha = (ipha_t *)(mp->b_rptr);
    338 	}
    339 
    340 	/*
    341 	 * Validate lengths
    342 	 */
    343 	if (igmplen < IGMP_MINLEN) {
    344 		++ipst->ips_igmpstat.igps_rcv_tooshort;
    345 		goto bad_pkt;
    346 	}
    347 
    348 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
    349 	src = ipha->ipha_src;
    350 	dst = ipha->ipha_dst;
    351 	if (ip_debug > 1)
    352 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
    353 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
    354 		    (int)ntohl(src), (int)ntohl(dst),
    355 		    ill->ill_name);
    356 
    357 	switch (igmpa->igmpa_type) {
    358 	case IGMP_MEMBERSHIP_QUERY:
    359 		/*
    360 		 * packet length differentiates between v1/v2 and v3
    361 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
    362 		 */
    363 		if ((igmplen == IGMP_MINLEN) ||
    364 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
    365 			next = igmp_query_in(ipha, igmpa, ill);
    366 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
    367 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
    368 			    igmplen);
    369 		} else {
    370 			++ipst->ips_igmpstat.igps_rcv_tooshort;
    371 			goto bad_pkt;
    372 		}
    373 		if (next == 0)
    374 			goto bad_pkt;
    375 
    376 		if (next != INFINITY)
    377 			igmp_start_timers(next, ipst);
    378 
    379 		break;
    380 
    381 	case IGMP_V1_MEMBERSHIP_REPORT:
    382 	case IGMP_V2_MEMBERSHIP_REPORT:
    383 		/*
    384 		 * For fast leave to work, we have to know that we are the
    385 		 * last person to send a report for this group. Reports
    386 		 * generated by us are looped back since we could potentially
    387 		 * be a multicast router, so discard reports sourced by me.
    388 		 */
    389 		mutex_enter(&ill->ill_lock);
    390 		for (ipif = ill->ill_ipif; ipif != NULL;
    391 		    ipif = ipif->ipif_next) {
    392 			if (ipif->ipif_lcl_addr == src) {
    393 				if (ip_debug > 1) {
    394 					(void) mi_strlog(ill->ill_rq,
    395 					    1,
    396 					    SL_TRACE,
    397 					    "igmp_input: we are only "
    398 					    "member src 0x%x\n",
    399 					    (int)ntohl(src));
    400 				}
    401 				mutex_exit(&ill->ill_lock);
    402 				return (mp);
    403 			}
    404 		}
    405 		mutex_exit(&ill->ill_lock);
    406 
    407 		++ipst->ips_igmpstat.igps_rcv_reports;
    408 		group = igmpa->igmpa_group;
    409 		if (!CLASSD(group)) {
    410 			++ipst->ips_igmpstat.igps_rcv_badreports;
    411 			goto bad_pkt;
    412 		}
    413 
    414 		/*
    415 		 * KLUDGE: if the IP source address of the report has an
    416 		 * unspecified (i.e., zero) subnet number, as is allowed for
    417 		 * a booting host, replace it with the correct subnet number
    418 		 * so that a process-level multicast routing demon can
    419 		 * determine which subnet it arrived from.  This is necessary
    420 		 * to compensate for the lack of any way for a process to
    421 		 * determine the arrival interface of an incoming packet.
    422 		 *
    423 		 * Requires that a copy of *this* message it passed up
    424 		 * to the raw interface which is done by our caller.
    425 		 */
    426 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
    427 			/* Pick the first ipif on this ill */
    428 			mutex_enter(&ill->ill_lock);
    429 			src = ill->ill_ipif->ipif_subnet;
    430 			mutex_exit(&ill->ill_lock);
    431 			ip1dbg(("igmp_input: changed src to 0x%x\n",
    432 			    (int)ntohl(src)));
    433 			ipha->ipha_src = src;
    434 		}
    435 
    436 		/*
    437 		 * If our ill has ILMs that belong to the group being
    438 		 * reported, and we are a 'Delaying Member' in the RFC
    439 		 * terminology, stop our timer for that group and 'clear
    440 		 * flag' i.e. mark as IGMP_OTHERMEMBER.
    441 		 */
    442 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
    443 		IN6_IPADDR_TO_V4MAPPED(group, &v6group);
    444 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
    445 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
    446 				continue;
    447 
    448 			++ipst->ips_igmpstat.igps_rcv_ourreports;
    449 			ilm->ilm_timer = INFINITY;
    450 			ilm->ilm_state = IGMP_OTHERMEMBER;
    451 		} /* for */
    452 		rw_exit(&ill->ill_mcast_lock);
    453 		ill_mcast_timer_start(ill->ill_ipst);
    454 		break;
    455 
    456 	case IGMP_V3_MEMBERSHIP_REPORT:
    457 		/*
    458 		 * Currently nothing to do here; IGMP router is not
    459 		 * implemented in ip, and v3 hosts don't pay attention
    460 		 * to membership reports.
    461 		 */
    462 		break;
    463 	}
    464 	/*
    465 	 * Pass all valid IGMP packets up to any process(es) listening
    466 	 * on a raw IGMP socket. Do not free the packet.
    467 	 */
    468 	return (mp);
    469 
    470 bad_pkt:
    471 	freemsg(mp);
    472 	return (NULL);
    473 }
    474 
    475 static uint_t
    476 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
    477 {
    478 	ilm_t	*ilm;
    479 	int	timer;
    480 	uint_t	next, current;
    481 	ip_stack_t	 *ipst;
    482 
    483 	ipst = ill->ill_ipst;
    484 	++ipst->ips_igmpstat.igps_rcv_queries;
    485 
    486 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
    487 	/*
    488 	 * In the IGMPv2 specification, there are 3 states and a flag.
    489 	 *
    490 	 * In Non-Member state, we simply don't have a membership record.
    491 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
    492 	 * < INFINITY).  In Idle Member state, our timer is not running
    493 	 * (ilm->ilm_timer == INFINITY).
    494 	 *
    495 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
    496 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
    497 	 * if I sent the last report.
    498 	 */
    499 	if ((igmpa->igmpa_code == 0) ||
    500 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
    501 		/*
    502 		 * Query from an old router.
    503 		 * Remember that the querier on this interface is old,
    504 		 * and set the timer to the value in RFC 1112.
    505 		 */
    506 		ill->ill_mcast_v1_time = 0;
    507 		ill->ill_mcast_v1_tset = 1;
    508 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
    509 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
    510 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
    511 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
    512 			ill->ill_mcast_type = IGMP_V1_ROUTER;
    513 		}
    514 
    515 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
    516 
    517 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
    518 		    igmpa->igmpa_group != 0) {
    519 			++ipst->ips_igmpstat.igps_rcv_badqueries;
    520 			rw_exit(&ill->ill_mcast_lock);
    521 			ill_mcast_timer_start(ill->ill_ipst);
    522 			return (0);
    523 		}
    524 
    525 	} else {
    526 		in_addr_t group;
    527 
    528 		/*
    529 		 * Query from a new router
    530 		 * Simply do a validity check
    531 		 */
    532 		group = igmpa->igmpa_group;
    533 		if (group != 0 && (!CLASSD(group))) {
    534 			++ipst->ips_igmpstat.igps_rcv_badqueries;
    535 			rw_exit(&ill->ill_mcast_lock);
    536 			ill_mcast_timer_start(ill->ill_ipst);
    537 			return (0);
    538 		}
    539 
    540 		/*
    541 		 * Switch interface state to v2 on receipt of a v2 query
    542 		 * ONLY IF current state is v3.  Let things be if current
    543 		 * state if v1 but do reset the v2-querier-present timer.
    544 		 */
    545 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
    546 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
    547 			    "to IGMP_V2_ROUTER", ill->ill_name));
    548 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
    549 			ill->ill_mcast_type = IGMP_V2_ROUTER;
    550 		}
    551 		ill->ill_mcast_v2_time = 0;
    552 		ill->ill_mcast_v2_tset = 1;
    553 
    554 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
    555 	}
    556 
    557 	if (ip_debug > 1) {
    558 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
    559 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
    560 		    (int)ntohs(igmpa->igmpa_code),
    561 		    (int)ntohs(igmpa->igmpa_type));
    562 	}
    563 
    564 	/*
    565 	 * -Start the timers in all of our membership records
    566 	 *  for the physical interface on which the query
    567 	 *  arrived, excluding those that belong to the "all
    568 	 *  hosts" group (224.0.0.1).
    569 	 *
    570 	 * -Restart any timer that is already running but has
    571 	 *  a value longer than the requested timeout.
    572 	 *
    573 	 * -Use the value specified in the query message as
    574 	 *  the maximum timeout.
    575 	 */
    576 	next = (unsigned)INFINITY;
    577 
    578 	current = CURRENT_MSTIME;
    579 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
    580 
    581 		/*
    582 		 * A multicast router joins INADDR_ANY address
    583 		 * to enable promiscuous reception of all
    584 		 * mcasts from the interface. This INADDR_ANY
    585 		 * is stored in the ilm_v6addr as V6 unspec addr
    586 		 */
    587 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
    588 			continue;
    589 		if (ilm->ilm_addr == htonl(INADDR_ANY))
    590 			continue;
    591 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
    592 		    (igmpa->igmpa_group == 0) ||
    593 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
    594 			if (ilm->ilm_timer > timer) {
    595 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
    596 				if (ilm->ilm_timer < next)
    597 					next = ilm->ilm_timer;
    598 				ilm->ilm_timer += current;
    599 			}
    600 		}
    601 	}
    602 	rw_exit(&ill->ill_mcast_lock);
    603 	/*
    604 	 * No packets have been sent above - no
    605 	 * ill_mcast_send_queued is needed.
    606 	 */
    607 	ill_mcast_timer_start(ill->ill_ipst);
    608 
    609 	return (next);
    610 }
    611 
    612 static uint_t
    613 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
    614 {
    615 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
    616 	uint_t		current;
    617 	ilm_t		*ilm;
    618 	ipaddr_t	*src_array;
    619 	uint8_t		qrv;
    620 	ip_stack_t	 *ipst;
    621 
    622 	ipst = ill->ill_ipst;
    623 	/* make sure numsrc matches packet size */
    624 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
    625 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
    626 		++ipst->ips_igmpstat.igps_rcv_tooshort;
    627 		return (0);
    628 	}
    629 	src_array = (ipaddr_t *)&igmp3qa[1];
    630 
    631 	++ipst->ips_igmpstat.igps_rcv_queries;
    632 
    633 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
    634 
    635 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
    636 		uint_t hdrval, mant, exp;
    637 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
    638 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
    639 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
    640 		mrd = (mant | 0x10) << (exp + 3);
    641 	}
    642 	if (mrd == 0)
    643 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
    644 	timer = DSEC_TO_MSEC(mrd);
    645 	MCAST_RANDOM_DELAY(delay, timer);
    646 	next = (unsigned)INFINITY;
    647 	current = CURRENT_MSTIME;
    648 
    649 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
    650 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
    651 	else
    652 		ill->ill_mcast_rv = qrv;
    653 
    654 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
    655 		uint_t hdrval, mant, exp;
    656 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
    657 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
    658 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
    659 		qqi = (mant | 0x10) << (exp + 3);
    660 	}
    661 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
    662 
    663 	/*
    664 	 * If we have a pending general query response that's scheduled
    665 	 * sooner than the delay we calculated for this response, then
    666 	 * no action is required (RFC3376 section 5.2 rule 1)
    667 	 */
    668 	if (ill->ill_global_timer < (current + delay)) {
    669 		rw_exit(&ill->ill_mcast_lock);
    670 		ill_mcast_timer_start(ill->ill_ipst);
    671 		return (next);
    672 	}
    673 
    674 	/*
    675 	 * Now take action depending upon query type:
    676 	 * general, group specific, or group/source specific.
    677 	 */
    678 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
    679 		/*
    680 		 * general query
    681 		 * We know global timer is either not running or is
    682 		 * greater than our calculated delay, so reset it to
    683 		 * our delay (random value in range [0, response time]).
    684 		 */
    685 		ill->ill_global_timer =  current + delay;
    686 		next = delay;
    687 	} else {
    688 		/* group or group/source specific query */
    689 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
    690 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
    691 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
    692 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
    693 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
    694 				continue;
    695 			/*
    696 			 * If the query is group specific or we have a
    697 			 * pending group specific query, the response is
    698 			 * group specific (pending sources list should be
    699 			 * empty).  Otherwise, need to update the pending
    700 			 * sources list for the group and source specific
    701 			 * response.
    702 			 */
    703 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
    704 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
    705 group_query:
    706 				FREE_SLIST(ilm->ilm_pendsrcs);
    707 				ilm->ilm_pendsrcs = NULL;
    708 			} else {
    709 				boolean_t overflow;
    710 				slist_t *pktl;
    711 				if (numsrc > MAX_FILTER_SIZE ||
    712 				    (ilm->ilm_pendsrcs == NULL &&
    713 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
    714 					/*
    715 					 * We've been sent more sources than
    716 					 * we can deal with; or we can't deal
    717 					 * with a source list at all.  Revert
    718 					 * to a group specific query.
    719 					 */
    720 					goto group_query;
    721 				}
    722 				if ((pktl = l_alloc()) == NULL)
    723 					goto group_query;
    724 				pktl->sl_numsrc = numsrc;
    725 				for (i = 0; i < numsrc; i++)
    726 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
    727 					    &(pktl->sl_addr[i]));
    728 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
    729 				    &overflow);
    730 				l_free(pktl);
    731 				if (overflow)
    732 					goto group_query;
    733 			}
    734 
    735 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
    736 			    INFINITY : (ilm->ilm_timer - current);
    737 			/* choose soonest timer */
    738 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
    739 			if (ilm->ilm_timer < next)
    740 				next = ilm->ilm_timer;
    741 			ilm->ilm_timer += current;
    742 		}
    743 	}
    744 	rw_exit(&ill->ill_mcast_lock);
    745 	/*
    746 	 * No packets have been sent above - no
    747 	 * ill_mcast_send_queued is needed.
    748 	 */
    749 	ill_mcast_timer_start(ill->ill_ipst);
    750 
    751 	return (next);
    752 }
    753 
    754 /*
    755  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
    756  * and it gets sent after the lock is dropped.
    757  */
    758 void
    759 igmp_joingroup(ilm_t *ilm)
    760 {
    761 	uint_t	timer;
    762 	ill_t	*ill;
    763 	ip_stack_t	*ipst = ilm->ilm_ipst;
    764 
    765 	ill = ilm->ilm_ill;
    766 
    767 	ASSERT(!ill->ill_isv6);
    768 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
    769 
    770 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
    771 		ilm->ilm_rtx.rtx_timer = INFINITY;
    772 		ilm->ilm_state = IGMP_OTHERMEMBER;
    773 	} else {
    774 		ip1dbg(("Querier mode %d, sending report, group %x\n",
    775 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
    776 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
    777 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
    778 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
    779 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
    780 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
    781 			mrec_t *rp;
    782 			mcast_record_t rtype;
    783 			/*
    784 			 * The possible state changes we need to handle here:
    785 			 *   Old State	New State	Report
    786 			 *
    787 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
    788 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
    789 			 *
    790 			 * No need to send the BLOCK(0) report; ALLOW(X)
    791 			 * is enough.
    792 			 */
    793 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
    794 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
    795 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
    796 			    ilm->ilm_filter, NULL);
    797 			igmpv3_sendrpt(ill, rp);
    798 			/*
    799 			 * Set up retransmission state.  Timer is set below,
    800 			 * for both v3 and older versions.
    801 			 */
    802 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
    803 			    ilm->ilm_filter);
    804 		}
    805 
    806 		/* Set the ilm timer value */
    807 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
    808 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
    809 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
    810 		timer = ilm->ilm_rtx.rtx_timer;
    811 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
    812 		ilm->ilm_state = IGMP_IREPORTEDLAST;
    813 
    814 		/*
    815 		 * We are holding ill_mcast_lock here and the timeout
    816 		 * handler (igmp_timeout_handler_per_ill) acquires that
    817 		 * lock. Hence we can't call igmp_start_timers since it could
    818 		 * deadlock in untimeout().
    819 		 * Instead the thread which drops ill_mcast_lock will have
    820 		 * to call ill_mcast_timer_start().
    821 		 */
    822 		mutex_enter(&ipst->ips_igmp_timer_lock);
    823 		ipst->ips_igmp_deferred_next = MIN(timer,
    824 		    ipst->ips_igmp_deferred_next);
    825 		mutex_exit(&ipst->ips_igmp_timer_lock);
    826 	}
    827 
    828 	if (ip_debug > 1) {
    829 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
    830 		    "igmp_joingroup: multicast_type %d timer %d",
    831 		    (ilm->ilm_ill->ill_mcast_type),
    832 		    (int)ntohl(timer));
    833 	}
    834 }
    835 
    836 /*
    837  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
    838  * and it gets sent after the lock is dropped.
    839  */
    840 void
    841 mld_joingroup(ilm_t *ilm)
    842 {
    843 	uint_t	timer;
    844 	ill_t	*ill;
    845 	ip_stack_t	*ipst = ilm->ilm_ipst;
    846 
    847 	ill = ilm->ilm_ill;
    848 
    849 	ASSERT(ill->ill_isv6);
    850 
    851 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
    852 
    853 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
    854 		ilm->ilm_rtx.rtx_timer = INFINITY;
    855 		ilm->ilm_state = IGMP_OTHERMEMBER;
    856 	} else {
    857 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
    858 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
    859 		} else {
    860 			mrec_t *rp;
    861 			mcast_record_t rtype;
    862 			/*
    863 			 * The possible state changes we need to handle here:
    864 			 *	Old State   New State	Report
    865 			 *
    866 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
    867 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
    868 			 *
    869 			 * No need to send the BLOCK(0) report; ALLOW(X)
    870 			 * is enough
    871 			 */
    872 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
    873 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
    874 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
    875 			    ilm->ilm_filter, NULL);
    876 			mldv2_sendrpt(ill, rp);
    877 			/*
    878 			 * Set up retransmission state.  Timer is set below,
    879 			 * for both v2 and v1.
    880 			 */
    881 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
    882 			    ilm->ilm_filter);
    883 		}
    884 
    885 		/* Set the ilm timer value */
    886 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
    887 		    ilm->ilm_rtx.rtx_cnt > 0);
    888 
    889 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
    890 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
    891 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
    892 		timer = ilm->ilm_rtx.rtx_timer;
    893 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
    894 		ilm->ilm_state = IGMP_IREPORTEDLAST;
    895 
    896 		/*
    897 		 * We are holding ill_mcast_lock here and the timeout
    898 		 * handler (mld_timeout_handler_per_ill) acquires that
    899 		 * lock. Hence we can't call mld_start_timers since it could
    900 		 * deadlock in untimeout().
    901 		 * Instead the thread which drops ill_mcast_lock will have
    902 		 * to call ill_mcast_timer_start().
    903 		 */
    904 		mutex_enter(&ipst->ips_mld_timer_lock);
    905 		ipst->ips_mld_deferred_next = MIN(timer,
    906 		    ipst->ips_mld_deferred_next);
    907 		mutex_exit(&ipst->ips_mld_timer_lock);
    908 	}
    909 
    910 	if (ip_debug > 1) {
    911 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
    912 		    "mld_joingroup: multicast_type %d timer %d",
    913 		    (ilm->ilm_ill->ill_mcast_type),
    914 		    (int)ntohl(timer));
    915 	}
    916 }
    917 
    918 /*
    919  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
    920  * and it gets sent after the lock is dropped.
    921  */
    922 void
    923 igmp_leavegroup(ilm_t *ilm)
    924 {
    925 	ill_t *ill = ilm->ilm_ill;
    926 
    927 	ASSERT(!ill->ill_isv6);
    928 
    929 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
    930 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
    931 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
    932 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
    933 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
    934 		    (htonl(INADDR_ALLRTRS_GROUP)));
    935 		return;
    936 	}
    937 	if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
    938 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
    939 		mrec_t *rp;
    940 		/*
    941 		 * The possible state changes we need to handle here:
    942 		 *	Old State	New State	Report
    943 		 *
    944 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
    945 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
    946 		 *
    947 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
    948 		 */
    949 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
    950 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
    951 			    ilm->ilm_filter, NULL);
    952 		} else {
    953 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
    954 			    NULL, NULL);
    955 		}
    956 		igmpv3_sendrpt(ill, rp);
    957 		return;
    958 	}
    959 }
    960 
    961 /*
    962  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
    963  * and it gets sent after the lock is dropped.
    964  */
    965 void
    966 mld_leavegroup(ilm_t *ilm)
    967 {
    968 	ill_t *ill = ilm->ilm_ill;
    969 
    970 	ASSERT(ill->ill_isv6);
    971 
    972 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
    973 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
    974 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
    975 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
    976 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
    977 		return;
    978 	}
    979 	if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
    980 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
    981 		mrec_t *rp;
    982 		/*
    983 		 * The possible state changes we need to handle here:
    984 		 *	Old State	New State	Report
    985 		 *
    986 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
    987 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
    988 		 *
    989 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
    990 		 */
    991 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
    992 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
    993 			    ilm->ilm_filter, NULL);
    994 		} else {
    995 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
    996 			    NULL, NULL);
    997 		}
    998 		mldv2_sendrpt(ill, rp);
    999 		return;
   1000 	}
   1001 }
   1002 
   1003 /*
   1004  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
   1005  * and it gets sent after the lock is dropped.
   1006  */
   1007 void
   1008 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
   1009 {
   1010 	ill_t *ill;
   1011 	mrec_t *rp;
   1012 	ip_stack_t	*ipst = ilm->ilm_ipst;
   1013 
   1014 	ASSERT(ilm != NULL);
   1015 
   1016 	/* state change reports should only be sent if the router is v3 */
   1017 	if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER)
   1018 		return;
   1019 
   1020 	ill = ilm->ilm_ill;
   1021 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
   1022 
   1023 	/*
   1024 	 * Compare existing(old) state with the new state and prepare
   1025 	 * State Change Report, according to the rules in RFC 3376:
   1026 	 *
   1027 	 *	Old State	New State	State Change Report
   1028 	 *
   1029 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
   1030 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
   1031 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
   1032 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
   1033 	 */
   1034 
   1035 	if (ilm->ilm_fmode == fmode) {
   1036 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
   1037 		slist_t *allow, *block;
   1038 		if (((a_minus_b = l_alloc()) == NULL) ||
   1039 		    ((b_minus_a = l_alloc()) == NULL)) {
   1040 			l_free(a_minus_b);
   1041 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
   1042 				goto send_to_ex;
   1043 			else
   1044 				goto send_to_in;
   1045 		}
   1046 		l_difference(ilm->ilm_filter, flist, a_minus_b);
   1047 		l_difference(flist, ilm->ilm_filter, b_minus_a);
   1048 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
   1049 			allow = b_minus_a;
   1050 			block = a_minus_b;
   1051 		} else {
   1052 			allow = a_minus_b;
   1053 			block = b_minus_a;
   1054 		}
   1055 		rp = NULL;
   1056 		if (!SLIST_IS_EMPTY(allow))
   1057 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
   1058 			    allow, rp);
   1059 		if (!SLIST_IS_EMPTY(block))
   1060 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
   1061 			    block, rp);
   1062 		l_free(a_minus_b);
   1063 		l_free(b_minus_a);
   1064 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
   1065 send_to_ex:
   1066 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
   1067 		    NULL);
   1068 	} else {
   1069 send_to_in:
   1070 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
   1071 		    NULL);
   1072 	}
   1073 
   1074 	/*
   1075 	 * Need to set up retransmission state; merge the new info with the
   1076 	 * current state (which may be null).  If the timer is not currently
   1077 	 * running, the caller will start it when dropping ill_mcast_lock.
   1078 	 */
   1079 	rp = mcast_merge_rtx(ilm, rp, flist);
   1080 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
   1081 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
   1082 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
   1083 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
   1084 		mutex_enter(&ipst->ips_igmp_timer_lock);
   1085 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
   1086 		    ilm->ilm_rtx.rtx_timer);
   1087 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
   1088 		mutex_exit(&ipst->ips_igmp_timer_lock);
   1089 	}
   1090 
   1091 	igmpv3_sendrpt(ill, rp);
   1092 }
   1093 
   1094 /*
   1095  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
   1096  * and it gets sent after the lock is dropped.
   1097  */
   1098 void
   1099 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
   1100 {
   1101 	ill_t *ill;
   1102 	mrec_t *rp = NULL;
   1103 	ip_stack_t	*ipst = ilm->ilm_ipst;
   1104 
   1105 	ASSERT(ilm != NULL);
   1106 
   1107 	ill = ilm->ilm_ill;
   1108 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
   1109 
   1110 	/* only need to send if we have an mldv2-capable router */
   1111 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
   1112 		return;
   1113 	}
   1114 
   1115 	/*
   1116 	 * Compare existing (old) state with the new state passed in
   1117 	 * and send appropriate MLDv2 State Change Report.
   1118 	 *
   1119 	 *	Old State	New State	State Change Report
   1120 	 *
   1121 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
   1122 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
   1123 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
   1124 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
   1125 	 */
   1126 	if (ilm->ilm_fmode == fmode) {
   1127 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
   1128 		slist_t *allow, *block;
   1129 		if (((a_minus_b = l_alloc()) == NULL) ||
   1130 		    ((b_minus_a = l_alloc()) == NULL)) {
   1131 			l_free(a_minus_b);
   1132 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
   1133 				goto send_to_ex;
   1134 			else
   1135 				goto send_to_in;
   1136 		}
   1137 		l_difference(ilm->ilm_filter, flist, a_minus_b);
   1138 		l_difference(flist, ilm->ilm_filter, b_minus_a);
   1139 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
   1140 			allow = b_minus_a;
   1141 			block = a_minus_b;
   1142 		} else {
   1143 			allow = a_minus_b;
   1144 			block = b_minus_a;
   1145 		}
   1146 		if (!SLIST_IS_EMPTY(allow))
   1147 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
   1148 			    allow, rp);
   1149 		if (!SLIST_IS_EMPTY(block))
   1150 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
   1151 			    block, rp);
   1152 		l_free(a_minus_b);
   1153 		l_free(b_minus_a);
   1154 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
   1155 send_to_ex:
   1156 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
   1157 		    NULL);
   1158 	} else {
   1159 send_to_in:
   1160 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
   1161 		    NULL);
   1162 	}
   1163 
   1164 	/*
   1165 	 * Need to set up retransmission state; merge the new info with the
   1166 	 * current state (which may be null).  If the timer is not currently
   1167 	 * running, the caller will start it when dropping ill_mcast_lock.
   1168 	 */
   1169 	rp = mcast_merge_rtx(ilm, rp, flist);
   1170 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
   1171 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
   1172 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
   1173 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
   1174 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
   1175 		mutex_enter(&ipst->ips_mld_timer_lock);
   1176 		ipst->ips_mld_deferred_next =
   1177 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
   1178 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
   1179 		mutex_exit(&ipst->ips_mld_timer_lock);
   1180 	}
   1181 
   1182 	mldv2_sendrpt(ill, rp);
   1183 }
   1184 
   1185 uint_t
   1186 igmp_timeout_handler_per_ill(ill_t *ill)
   1187 {
   1188 	uint_t	next = INFINITY, current;
   1189 	ilm_t	*ilm;
   1190 	mrec_t	*rp = NULL;
   1191 	mrec_t	*rtxrp = NULL;
   1192 	rtx_state_t *rtxp;
   1193 	mcast_record_t	rtype;
   1194 
   1195 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
   1196 
   1197 	current = CURRENT_MSTIME;
   1198 	/* First check the global timer on this interface */
   1199 	if (ill->ill_global_timer == INFINITY)
   1200 		goto per_ilm_timer;
   1201 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
   1202 		ill->ill_global_timer = INFINITY;
   1203 		/*
   1204 		 * Send report for each group on this interface.
   1205 		 * Since we just set the global timer (received a v3 general
   1206 		 * query), need to skip the all hosts addr (224.0.0.1), per
   1207 		 * RFC 3376 section 5.
   1208 		 */
   1209 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
   1210 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
   1211 				continue;
   1212 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
   1213 			    ilm->ilm_filter, rp);
   1214 			/*
   1215 			 * Since we're sending a report on this group, okay
   1216 			 * to delete pending group-specific timers.  Note
   1217 			 * that group-specific retransmit timers still need
   1218 			 * to be checked in the per_ilm_timer for-loop.
   1219 			 */
   1220 			ilm->ilm_timer = INFINITY;
   1221 			ilm->ilm_state = IGMP_IREPORTEDLAST;
   1222 			FREE_SLIST(ilm->ilm_pendsrcs);
   1223 			ilm->ilm_pendsrcs = NULL;
   1224 		}
   1225 		igmpv3_sendrpt(ill, rp);
   1226 		rp = NULL;
   1227 	} else {
   1228 		if ((ill->ill_global_timer - current) < next)
   1229 			next = ill->ill_global_timer - current;
   1230 	}
   1231 
   1232 per_ilm_timer:
   1233 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
   1234 		if (ilm->ilm_timer == INFINITY)
   1235 			goto per_ilm_rtxtimer;
   1236 
   1237 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
   1238 			if ((ilm->ilm_timer - current) < next)
   1239 				next = ilm->ilm_timer - current;
   1240 
   1241 			if (ip_debug > 1) {
   1242 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
   1243 				    "igmp_timo_hlr 2: ilm_timr %d "
   1244 				    "typ %d nxt %d",
   1245 				    (int)ntohl(ilm->ilm_timer - current),
   1246 				    (ill->ill_mcast_type), next);
   1247 			}
   1248 
   1249 			goto per_ilm_rtxtimer;
   1250 		}
   1251 
   1252 		/* the timer has expired, need to take action */
   1253 		ilm->ilm_timer = INFINITY;
   1254 		ilm->ilm_state = IGMP_IREPORTEDLAST;
   1255 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
   1256 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
   1257 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
   1258 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
   1259 		} else {
   1260 			slist_t *rsp;
   1261 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
   1262 			    (rsp = l_alloc()) != NULL) {
   1263 				/*
   1264 				 * Contents of reply depend on pending
   1265 				 * requested source list.
   1266 				 */
   1267 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
   1268 					l_intersection(ilm->ilm_filter,
   1269 					    ilm->ilm_pendsrcs, rsp);
   1270 				} else {
   1271 					l_difference(ilm->ilm_pendsrcs,
   1272 					    ilm->ilm_filter, rsp);
   1273 				}
   1274 				FREE_SLIST(ilm->ilm_pendsrcs);
   1275 				ilm->ilm_pendsrcs = NULL;
   1276 				if (!SLIST_IS_EMPTY(rsp))
   1277 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
   1278 					    &ilm->ilm_v6addr, rsp, rp);
   1279 				FREE_SLIST(rsp);
   1280 			} else {
   1281 				/*
   1282 				 * Either the pending request is just group-
   1283 				 * specific, or we couldn't get the resources
   1284 				 * (rsp) to build a source-specific reply.
   1285 				 */
   1286 				rp = mcast_bldmrec(ilm->ilm_fmode,
   1287 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
   1288 			}
   1289 			igmpv3_sendrpt(ill, rp);
   1290 			rp = NULL;
   1291 		}
   1292 
   1293 per_ilm_rtxtimer:
   1294 		rtxp = &ilm->ilm_rtx;
   1295 
   1296 		if (rtxp->rtx_timer == INFINITY)
   1297 			continue;
   1298 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
   1299 			if ((rtxp->rtx_timer - current) < next)
   1300 				next = rtxp->rtx_timer - current;
   1301 			continue;
   1302 		}
   1303 
   1304 		rtxp->rtx_timer = INFINITY;
   1305 		ilm->ilm_state = IGMP_IREPORTEDLAST;
   1306 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
   1307 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
   1308 			continue;
   1309 		}
   1310 		if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
   1311 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
   1312 			continue;
   1313 		}
   1314 
   1315 		/*
   1316 		 * The retransmit timer has popped, and our router is
   1317 		 * IGMPv3.  We have to delve into the retransmit state
   1318 		 * stored in the ilm.
   1319 		 *
   1320 		 * Decrement the retransmit count.  If the fmode rtx
   1321 		 * count is active, decrement it, and send a filter
   1322 		 * mode change report with the ilm's source list.
   1323 		 * Otherwise, send a source list change report with
   1324 		 * the current retransmit lists.
   1325 		 */
   1326 		ASSERT(rtxp->rtx_cnt > 0);
   1327 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
   1328 		rtxp->rtx_cnt--;
   1329 		if (rtxp->rtx_fmode_cnt > 0) {
   1330 			rtxp->rtx_fmode_cnt--;
   1331 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
   1332 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
   1333 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
   1334 			    ilm->ilm_filter, rtxrp);
   1335 		} else {
   1336 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
   1337 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
   1338 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
   1339 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
   1340 		}
   1341 		if (rtxp->rtx_cnt > 0) {
   1342 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
   1343 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
   1344 			if (rtxp->rtx_timer < next)
   1345 				next = rtxp->rtx_timer;
   1346 			rtxp->rtx_timer += current;
   1347 		} else {
   1348 			ASSERT(rtxp->rtx_timer == INFINITY);
   1349 			CLEAR_SLIST(rtxp->rtx_allow);
   1350 			CLEAR_SLIST(rtxp->rtx_block);
   1351 		}
   1352 		igmpv3_sendrpt(ill, rtxrp);
   1353 		rtxrp = NULL;
   1354 	}
   1355 
   1356 	rw_exit(&ill->ill_mcast_lock);
   1357 	/* Send any deferred/queued IP packets */
   1358 	ill_mcast_send_queued(ill);
   1359 	/* Defer ill_mcast_timer_start() until the caller is done */
   1360 
   1361 	return (next);
   1362 }
   1363 
   1364 /*
   1365  * igmp_timeout_handler:
   1366  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
   1367  * Returns number of ticks to next event (or 0 if none).
   1368  *
   1369  * As part of multicast join and leave igmp we may need to send out an
   1370  * igmp request. The igmp related state variables in the ilm are protected
   1371  * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts.
   1372  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
   1373  * starts the igmp timer if needed. It serializes multiple threads trying to
   1374  * simultaneously start the timer using the igmp_timer_setter_active flag.
   1375  *
   1376  * igmp_input() receives igmp queries and responds to the queries
   1377  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
   1378  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
   1379  * performs the action exclusively after acquiring ill_mcast_lock.
   1380  *
   1381  * The igmp_slowtimeo() function is called thru another timer.
   1382  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
   1383  */
   1384 void
   1385 igmp_timeout_handler(void *arg)
   1386 {
   1387 	ill_t	*ill;
   1388 	uint_t  global_next = INFINITY;
   1389 	uint_t  next;
   1390 	ill_walk_context_t ctx;
   1391 	ip_stack_t *ipst = arg;
   1392 
   1393 	ASSERT(arg != NULL);
   1394 	mutex_enter(&ipst->ips_igmp_timer_lock);
   1395 	ASSERT(ipst->ips_igmp_timeout_id != 0);
   1396 	ipst->ips_igmp_timeout_id = 0;
   1397 	ipst->ips_igmp_timer_scheduled_last = 0;
   1398 	ipst->ips_igmp_time_to_next = 0;
   1399 	mutex_exit(&ipst->ips_igmp_timer_lock);
   1400 
   1401 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1402 	ill = ILL_START_WALK_V4(&ctx, ipst);
   1403 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   1404 		ASSERT(!ill->ill_isv6);
   1405 		/* Make sure the ill isn't going away. */
   1406 		if (!ill_check_and_refhold(ill))
   1407 			continue;
   1408 		rw_exit(&ipst->ips_ill_g_lock);
   1409 		next = igmp_timeout_handler_per_ill(ill);
   1410 		if (next < global_next)
   1411 			global_next = next;
   1412 		ill_refrele(ill);
   1413 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1414 	}
   1415 	rw_exit(&ipst->ips_ill_g_lock);
   1416 	if (global_next != INFINITY)
   1417 		igmp_start_timers(global_next, ipst);
   1418 }
   1419 
   1420 /*
   1421  * mld_timeout_handler:
   1422  * Called when there are timeout events, every next (tick).
   1423  * Returns number of ticks to next event (or 0 if none).
   1424  */
   1425 uint_t
   1426 mld_timeout_handler_per_ill(ill_t *ill)
   1427 {
   1428 	ilm_t 	*ilm;
   1429 	uint_t	next = INFINITY, current;
   1430 	mrec_t	*rp, *rtxrp;
   1431 	rtx_state_t *rtxp;
   1432 	mcast_record_t	rtype;
   1433 
   1434 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
   1435 
   1436 	current = CURRENT_MSTIME;
   1437 	/*
   1438 	 * First check the global timer on this interface; the global timer
   1439 	 * is not used for MLDv1, so if it's set we can assume we're v2.
   1440 	 */
   1441 	if (ill->ill_global_timer == INFINITY)
   1442 		goto per_ilm_timer;
   1443 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
   1444 		ill->ill_global_timer = INFINITY;
   1445 		/*
   1446 		 * Send report for each group on this interface.
   1447 		 * Since we just set the global timer (received a v2 general
   1448 		 * query), need to skip the all hosts addr (ff02::1), per
   1449 		 * RFC 3810 section 6.
   1450 		 */
   1451 		rp = NULL;
   1452 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
   1453 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
   1454 			    &ipv6_all_hosts_mcast))
   1455 				continue;
   1456 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
   1457 			    ilm->ilm_filter, rp);
   1458 			/*
   1459 			 * Since we're sending a report on this group, okay
   1460 			 * to delete pending group-specific timers.  Note
   1461 			 * that group-specific retransmit timers still need
   1462 			 * to be checked in the per_ilm_timer for-loop.
   1463 			 */
   1464 			ilm->ilm_timer = INFINITY;
   1465 			ilm->ilm_state = IGMP_IREPORTEDLAST;
   1466 			FREE_SLIST(ilm->ilm_pendsrcs);
   1467 			ilm->ilm_pendsrcs = NULL;
   1468 		}
   1469 		mldv2_sendrpt(ill, rp);
   1470 	} else {
   1471 		if ((ill->ill_global_timer - current) < next)
   1472 			next = ill->ill_global_timer - current;
   1473 	}
   1474 
   1475 per_ilm_timer:
   1476 	rp = rtxrp = NULL;
   1477 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
   1478 		if (ilm->ilm_timer == INFINITY)
   1479 			goto per_ilm_rtxtimer;
   1480 
   1481 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
   1482 			if ((ilm->ilm_timer - current) < next)
   1483 				next = ilm->ilm_timer - current;
   1484 
   1485 			if (ip_debug > 1) {
   1486 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
   1487 				    "igmp_timo_hlr 2: ilm_timr"
   1488 				    " %d typ %d nxt %d",
   1489 				    (int)ntohl(ilm->ilm_timer - current),
   1490 				    (ill->ill_mcast_type), next);
   1491 			}
   1492 
   1493 			goto per_ilm_rtxtimer;
   1494 		}
   1495 
   1496 		/* the timer has expired, need to take action */
   1497 		ilm->ilm_timer = INFINITY;
   1498 		ilm->ilm_state = IGMP_IREPORTEDLAST;
   1499 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
   1500 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
   1501 		} else {
   1502 			slist_t *rsp;
   1503 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
   1504 			    (rsp = l_alloc()) != NULL) {
   1505 				/*
   1506 				 * Contents of reply depend on pending
   1507 				 * requested source list.
   1508 				 */
   1509 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
   1510 					l_intersection(ilm->ilm_filter,
   1511 					    ilm->ilm_pendsrcs, rsp);
   1512 				} else {
   1513 					l_difference(ilm->ilm_pendsrcs,
   1514 					    ilm->ilm_filter, rsp);
   1515 				}
   1516 				FREE_SLIST(ilm->ilm_pendsrcs);
   1517 				ilm->ilm_pendsrcs = NULL;
   1518 				if (!SLIST_IS_EMPTY(rsp))
   1519 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
   1520 					    &ilm->ilm_v6addr, rsp, rp);
   1521 				FREE_SLIST(rsp);
   1522 			} else {
   1523 				rp = mcast_bldmrec(ilm->ilm_fmode,
   1524 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
   1525 			}
   1526 		}
   1527 
   1528 per_ilm_rtxtimer:
   1529 		rtxp = &ilm->ilm_rtx;
   1530 
   1531 		if (rtxp->rtx_timer == INFINITY)
   1532 			continue;
   1533 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
   1534 			if ((rtxp->rtx_timer - current) < next)
   1535 				next = rtxp->rtx_timer - current;
   1536 			continue;
   1537 		}
   1538 
   1539 		rtxp->rtx_timer = INFINITY;
   1540 		ilm->ilm_state = IGMP_IREPORTEDLAST;
   1541 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
   1542 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
   1543 			continue;
   1544 		}
   1545 
   1546 		/*
   1547 		 * The retransmit timer has popped, and our router is
   1548 		 * MLDv2.  We have to delve into the retransmit state
   1549 		 * stored in the ilm.
   1550 		 *
   1551 		 * Decrement the retransmit count.  If the fmode rtx
   1552 		 * count is active, decrement it, and send a filter
   1553 		 * mode change report with the ilm's source list.
   1554 		 * Otherwise, send a source list change report with
   1555 		 * the current retransmit lists.
   1556 		 */
   1557 		ASSERT(rtxp->rtx_cnt > 0);
   1558 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
   1559 		rtxp->rtx_cnt--;
   1560 		if (rtxp->rtx_fmode_cnt > 0) {
   1561 			rtxp->rtx_fmode_cnt--;
   1562 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
   1563 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
   1564 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
   1565 			    ilm->ilm_filter, rtxrp);
   1566 		} else {
   1567 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
   1568 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
   1569 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
   1570 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
   1571 		}
   1572 		if (rtxp->rtx_cnt > 0) {
   1573 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
   1574 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
   1575 			if (rtxp->rtx_timer < next)
   1576 				next = rtxp->rtx_timer;
   1577 			rtxp->rtx_timer += current;
   1578 		} else {
   1579 			ASSERT(rtxp->rtx_timer == INFINITY);
   1580 			CLEAR_SLIST(rtxp->rtx_allow);
   1581 			CLEAR_SLIST(rtxp->rtx_block);
   1582 		}
   1583 	}
   1584 
   1585 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
   1586 		mldv2_sendrpt(ill, rp);
   1587 		mldv2_sendrpt(ill, rtxrp);
   1588 	}
   1589 	rw_exit(&ill->ill_mcast_lock);
   1590 	/* Send any deferred/queued IP packets */
   1591 	ill_mcast_send_queued(ill);
   1592 	/* Defer ill_mcast_timer_start() until the caller is done */
   1593 
   1594 	return (next);
   1595 }
   1596 
   1597 /*
   1598  * mld_timeout_handler:
   1599  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
   1600  * Returns number of ticks to next event (or 0 if none).
   1601  * MT issues are same as igmp_timeout_handler
   1602  */
   1603 void
   1604 mld_timeout_handler(void *arg)
   1605 {
   1606 	ill_t	*ill;
   1607 	uint_t  global_next = INFINITY;
   1608 	uint_t  next;
   1609 	ill_walk_context_t ctx;
   1610 	ip_stack_t *ipst = arg;
   1611 
   1612 	ASSERT(arg != NULL);
   1613 	mutex_enter(&ipst->ips_mld_timer_lock);
   1614 	ASSERT(ipst->ips_mld_timeout_id != 0);
   1615 	ipst->ips_mld_timeout_id = 0;
   1616 	ipst->ips_mld_timer_scheduled_last = 0;
   1617 	ipst->ips_mld_time_to_next = 0;
   1618 	mutex_exit(&ipst->ips_mld_timer_lock);
   1619 
   1620 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1621 	ill = ILL_START_WALK_V6(&ctx, ipst);
   1622 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
   1623 		ASSERT(ill->ill_isv6);
   1624 		/* Make sure the ill isn't going away. */
   1625 		if (!ill_check_and_refhold(ill))
   1626 			continue;
   1627 		rw_exit(&ipst->ips_ill_g_lock);
   1628 		next = mld_timeout_handler_per_ill(ill);
   1629 		if (next < global_next)
   1630 			global_next = next;
   1631 		ill_refrele(ill);
   1632 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1633 	}
   1634 	rw_exit(&ipst->ips_ill_g_lock);
   1635 	if (global_next != INFINITY)
   1636 		mld_start_timers(global_next, ipst);
   1637 }
   1638 
   1639 /*
   1640  * Calculate the Older Version Querier Present timeout value, in number
   1641  * of slowtimo intervals, for the given ill.
   1642  */
   1643 #define	OVQP(ill) \
   1644 	((1000 * (((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) \
   1645 	+ MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)
   1646 
   1647 /*
   1648  * igmp_slowtimo:
   1649  * - Resets to new router if we didnt we hear from the router
   1650  *   in IGMP_AGE_THRESHOLD seconds.
   1651  * - Resets slowtimeout.
   1652  * Check for ips_igmp_max_version ensures that we don't revert to a higher
   1653  * IGMP version than configured.
   1654  */
   1655 void
   1656 igmp_slowtimo(void *arg)
   1657 {
   1658 	ill_t	*ill;
   1659 	ill_if_t *ifp;
   1660 	avl_tree_t *avl_tree;
   1661 	ip_stack_t *ipst = (ip_stack_t *)arg;
   1662 
   1663 	ASSERT(arg != NULL);
   1664 
   1665 	/*
   1666 	 * The ill_if_t list is circular, hence the odd loop parameters.
   1667 	 *
   1668 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
   1669 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
   1670 	 * structure (allowing us to skip if none of the instances have timers
   1671 	 * running).
   1672 	 */
   1673 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1674 	for (ifp = IP_V4_ILL_G_LIST(ipst);
   1675 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
   1676 	    ifp = ifp->illif_next) {
   1677 		/*
   1678 		 * illif_mcast_v[12] are set using atomics. If an ill hears
   1679 		 * a V1 or V2 query now and we miss seeing the count now,
   1680 		 * we will see it the next time igmp_slowtimo is called.
   1681 		 */
   1682 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
   1683 			continue;
   1684 
   1685 		avl_tree = &ifp->illif_avl_by_ppa;
   1686 		for (ill = avl_first(avl_tree); ill != NULL;
   1687 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
   1688 			/* Make sure the ill isn't going away. */
   1689 			if (!ill_check_and_refhold(ill))
   1690 				continue;
   1691 			rw_exit(&ipst->ips_ill_g_lock);
   1692 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
   1693 			if (ill->ill_mcast_v1_tset == 1)
   1694 				ill->ill_mcast_v1_time++;
   1695 			if (ill->ill_mcast_v2_tset == 1)
   1696 				ill->ill_mcast_v2_time++;
   1697 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
   1698 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
   1699 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
   1700 				if ((ill->ill_mcast_v2_tset > 0) ||
   1701 				    (ipst->ips_igmp_max_version ==
   1702 				    IGMP_V2_ROUTER)) {
   1703 					ip1dbg(("V1 query timer "
   1704 					    "expired on %s; switching "
   1705 					    "mode to IGMP_V2\n",
   1706 					    ill->ill_name));
   1707 					ill->ill_mcast_type =
   1708 					    IGMP_V2_ROUTER;
   1709 				} else {
   1710 					ip1dbg(("V1 query timer "
   1711 					    "expired on %s; switching "
   1712 					    "mode to IGMP_V3\n",
   1713 					    ill->ill_name));
   1714 					ill->ill_mcast_type =
   1715 					    IGMP_V3_ROUTER;
   1716 				}
   1717 				ill->ill_mcast_v1_time = 0;
   1718 				ill->ill_mcast_v1_tset = 0;
   1719 				atomic_add_16(&ifp->illif_mcast_v1, -1);
   1720 			}
   1721 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
   1722 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
   1723 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
   1724 				ip1dbg(("V2 query timer expired on "
   1725 				    "%s; switching mode to IGMP_V3\n",
   1726 				    ill->ill_name));
   1727 				ill->ill_mcast_type = IGMP_V3_ROUTER;
   1728 				ill->ill_mcast_v2_time = 0;
   1729 				ill->ill_mcast_v2_tset = 0;
   1730 				atomic_add_16(&ifp->illif_mcast_v2, -1);
   1731 			}
   1732 			rw_exit(&ill->ill_mcast_lock);
   1733 			ill_refrele(ill);
   1734 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1735 		}
   1736 	}
   1737 	rw_exit(&ipst->ips_ill_g_lock);
   1738 	ill_mcast_timer_start(ipst);
   1739 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
   1740 	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
   1741 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
   1742 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
   1743 }
   1744 
   1745 /*
   1746  * mld_slowtimo:
   1747  * - Resets to newer version if we didn't hear from the older version router
   1748  *   in MLD_AGE_THRESHOLD seconds.
   1749  * - Restarts slowtimeout.
   1750  * Check for ips_mld_max_version ensures that we don't revert to a higher
   1751  * IGMP version than configured.
   1752  */
   1753 void
   1754 mld_slowtimo(void *arg)
   1755 {
   1756 	ill_t *ill;
   1757 	ill_if_t *ifp;
   1758 	avl_tree_t *avl_tree;
   1759 	ip_stack_t *ipst = (ip_stack_t *)arg;
   1760 
   1761 	ASSERT(arg != NULL);
   1762 	/* See comments in igmp_slowtimo() above... */
   1763 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1764 	for (ifp = IP_V6_ILL_G_LIST(ipst);
   1765 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
   1766 	    ifp = ifp->illif_next) {
   1767 		if (ifp->illif_mcast_v1 == 0)
   1768 			continue;
   1769 
   1770 		avl_tree = &ifp->illif_avl_by_ppa;
   1771 		for (ill = avl_first(avl_tree); ill != NULL;
   1772 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
   1773 			/* Make sure the ill isn't going away. */
   1774 			if (!ill_check_and_refhold(ill))
   1775 				continue;
   1776 			rw_exit(&ipst->ips_ill_g_lock);
   1777 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
   1778 			if (ill->ill_mcast_v1_tset == 1)
   1779 				ill->ill_mcast_v1_time++;
   1780 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
   1781 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
   1782 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
   1783 				ip1dbg(("MLD query timer expired on"
   1784 				    " %s; switching mode to MLD_V2\n",
   1785 				    ill->ill_name));
   1786 				ill->ill_mcast_type = MLD_V2_ROUTER;
   1787 				ill->ill_mcast_v1_time = 0;
   1788 				ill->ill_mcast_v1_tset = 0;
   1789 				atomic_add_16(&ifp->illif_mcast_v1, -1);
   1790 			}
   1791 			rw_exit(&ill->ill_mcast_lock);
   1792 			ill_refrele(ill);
   1793 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
   1794 		}
   1795 	}
   1796 	rw_exit(&ipst->ips_ill_g_lock);
   1797 	ill_mcast_timer_start(ipst);
   1798 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
   1799 	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
   1800 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
   1801 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
   1802 }
   1803 
   1804 /*
   1805  * igmp_sendpkt:
   1806  * This will send to ip_output_simple just like icmp_inbound.
   1807  */
   1808 static void
   1809 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
   1810 {
   1811 	mblk_t	*mp;
   1812 	igmpa_t	*igmpa;
   1813 	uint8_t *rtralert;
   1814 	ipha_t	*ipha;
   1815 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
   1816 	size_t	size  = hdrlen + sizeof (igmpa_t);
   1817 	ill_t 	*ill  = ilm->ilm_ill;
   1818 	ip_stack_t *ipst = ill->ill_ipst;
   1819 
   1820 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
   1821 
   1822 	mp = allocb(size, BPRI_HI);
   1823 	if (mp == NULL) {
   1824 		return;
   1825 	}
   1826 	mp->b_wptr = mp->b_rptr + size;
   1827 
   1828 	ipha = (ipha_t *)mp->b_rptr;
   1829 	rtralert = (uint8_t *)&(ipha[1]);
   1830 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
   1831 	igmpa->igmpa_type   = type;
   1832 	igmpa->igmpa_code   = 0;
   1833 	igmpa->igmpa_group  = ilm->ilm_addr;
   1834 	igmpa->igmpa_cksum  = 0;
   1835 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
   1836 
   1837 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
   1838 	rtralert[1] = RTRALERT_LEN;
   1839 	rtralert[2] = 0;
   1840 	rtralert[3] = 0;
   1841 
   1842 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
   1843 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
   1844 	ipha->ipha_type_of_service 	= 0;
   1845 	ipha->ipha_length = htons(size);
   1846 	ipha->ipha_ident = 0;
   1847 	ipha->ipha_fragment_offset_and_flags = 0;
   1848 	ipha->ipha_ttl 		= IGMP_TTL;
   1849 	ipha->ipha_protocol 	= IPPROTO_IGMP;
   1850 	ipha->ipha_hdr_checksum 	= 0;
   1851 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
   1852 	ipha->ipha_src 		= INADDR_ANY;
   1853 
   1854 	ill_mcast_queue(ill, mp);
   1855 
   1856 	++ipst->ips_igmpstat.igps_snd_reports;
   1857 }
   1858 
   1859 /*
   1860  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill.
   1861  * The report will contain one group record
   1862  * for each element of reclist.  If this causes packet length to
   1863  * exceed ill->ill_mtu, multiple reports are sent.
   1864  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
   1865  * and those buffers are freed here.
   1866  */
   1867 static void
   1868 igmpv3_sendrpt(ill_t *ill, mrec_t *reclist)
   1869 {
   1870 	igmp3ra_t *igmp3ra;
   1871 	grphdra_t *grphdr;
   1872 	mblk_t *mp;
   1873 	ipha_t *ipha;
   1874 	uint8_t *rtralert;
   1875 	ipaddr_t *src_array;
   1876 	int i, j, numrec, more_src_cnt;
   1877 	size_t hdrsize, size, rsize;
   1878 	mrec_t *rp, *cur_reclist;
   1879 	mrec_t *next_reclist = reclist;
   1880 	boolean_t morepkts;
   1881 	ip_stack_t	 *ipst = ill->ill_ipst;
   1882 
   1883 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
   1884 
   1885 	/* if there aren't any records, there's nothing to send */
   1886 	if (reclist == NULL)
   1887 		return;
   1888 
   1889 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
   1890 nextpkt:
   1891 	size = hdrsize + sizeof (igmp3ra_t);
   1892 	morepkts = B_FALSE;
   1893 	more_src_cnt = 0;
   1894 	cur_reclist = next_reclist;
   1895 	numrec = 0;
   1896 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
   1897 		rsize = sizeof (grphdra_t) +
   1898 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
   1899 		if (size + rsize > ill->ill_mtu) {
   1900 			if (rp == cur_reclist) {
   1901 				/*
   1902 				 * If the first mrec we looked at is too big
   1903 				 * to fit in a single packet (i.e the source
   1904 				 * list is too big), we must either truncate
   1905 				 * the list (if TO_EX or IS_EX), or send
   1906 				 * multiple reports for the same group (all
   1907 				 * other types).
   1908 				 */
   1909 				int srcspace, srcsperpkt;
   1910 				srcspace = ill->ill_mtu - (size +
   1911 				    sizeof (grphdra_t));
   1912 
   1913 				/*
   1914 				 * Skip if there's not even enough room in
   1915 				 * a single packet to send something useful.
   1916 				 */
   1917 				if (srcspace <= sizeof (ipaddr_t))
   1918 					continue;
   1919 
   1920 				srcsperpkt = srcspace / sizeof (ipaddr_t);
   1921 				/*
   1922 				 * Increment size and numrec, because we will
   1923 				 * be sending a record for the mrec we're
   1924 				 * looking at now.
   1925 				 */
   1926 				size += sizeof (grphdra_t) +
   1927 				    (srcsperpkt * sizeof (ipaddr_t));
   1928 				numrec++;
   1929 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
   1930 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
   1931 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
   1932 					if (rp->mrec_next == NULL) {
   1933 						/* no more packets to send */
   1934 						break;
   1935 					} else {
   1936 						/*
   1937 						 * more packets, but we're
   1938 						 * done with this mrec.
   1939 						 */
   1940 						next_reclist = rp->mrec_next;
   1941 					}
   1942 				} else {
   1943 					more_src_cnt = rp->mrec_srcs.sl_numsrc
   1944 					    - srcsperpkt;
   1945 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
   1946 					/*
   1947 					 * We'll fix up this mrec (remove the
   1948 					 * srcs we've already sent) before
   1949 					 * returning to nextpkt above.
   1950 					 */
   1951 					next_reclist = rp;
   1952 				}
   1953 			} else {
   1954 				next_reclist = rp;
   1955 			}
   1956 			morepkts = B_TRUE;
   1957 			break;
   1958 		}
   1959 		size += rsize;
   1960 		numrec++;
   1961 	}
   1962 
   1963 	mp = allocb(size, BPRI_HI);
   1964 	if (mp == NULL) {
   1965 		goto free_reclist;
   1966 	}
   1967 	bzero((char *)mp->b_rptr, size);
   1968 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
   1969 
   1970 	ipha = (ipha_t *)mp->b_rptr;
   1971 	rtralert = (uint8_t *)&(ipha[1]);
   1972 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
   1973 	grphdr = (grphdra_t *)&(igmp3ra[1]);
   1974 
   1975 	rp = cur_reclist;
   1976 	for (i = 0; i < numrec; i++) {
   1977 		grphdr->grphdra_type = rp->mrec_type;
   1978 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
   1979 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
   1980 		src_array = (ipaddr_t *)&(grphdr[1]);
   1981 
   1982 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
   1983 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
   1984 
   1985 		grphdr = (grphdra_t *)&(src_array[j]);
   1986 		rp = rp->mrec_next;
   1987 	}
   1988 
   1989 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
   1990 	igmp3ra->igmp3ra_numrec = htons(numrec);
   1991 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
   1992 
   1993 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
   1994 	rtralert[1] = RTRALERT_LEN;
   1995 	rtralert[2] = 0;
   1996 	rtralert[3] = 0;
   1997 
   1998 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
   1999 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
   2000 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
   2001 	ipha->ipha_length = htons(size);
   2002 	ipha->ipha_ttl = IGMP_TTL;
   2003 	ipha->ipha_protocol = IPPROTO_IGMP;
   2004 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
   2005 	ipha->ipha_src = INADDR_ANY;
   2006 
   2007 	ill_mcast_queue(ill, mp);
   2008 
   2009 	++ipst->ips_igmpstat.igps_snd_reports;
   2010 
   2011 	if (morepkts) {
   2012 		if (more_src_cnt > 0) {
   2013 			int index, mvsize;
   2014 			slist_t *sl = &next_reclist->mrec_srcs;
   2015 			index = sl->sl_numsrc;
   2016 			mvsize = more_src_cnt * sizeof (in6_addr_t);
   2017 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
   2018 			    mvsize);
   2019 			sl->sl_numsrc = more_src_cnt;
   2020 		}
   2021 		goto nextpkt;
   2022 	}
   2023 
   2024 free_reclist:
   2025 	while (reclist != NULL) {
   2026 		rp = reclist->mrec_next;
   2027 		mi_free(reclist);
   2028 		reclist = rp;
   2029 	}
   2030 }
   2031 
   2032 /*
   2033  * mld_input:
   2034  * Return NULL for a bad packet that is discarded here.
   2035  * Return mp if the message is OK and should be handed to "raw" receivers.
   2036  * Callers of mld_input() may need to reinitialize variables that were copied
   2037  * from the mblk as this calls pullupmsg().
   2038  */
   2039 mblk_t *
   2040 mld_input(mblk_t *mp, ip_recv_attr_t *ira)
   2041 {
   2042 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
   2043 	mld_hdr_t	*mldh;
   2044 	ilm_t		*ilm;
   2045 	ipif_t		*ipif;
   2046 	uint16_t	hdr_length, exthdr_length;
   2047 	in6_addr_t	*v6group_ptr;
   2048 	uint_t		next;
   2049 	int		mldlen;
   2050 	ill_t		*ill = ira->ira_ill;
   2051 	ip_stack_t	*ipst = ill->ill_ipst;
   2052 
   2053 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
   2054 
   2055 	/* Make sure the src address of the packet is link-local */
   2056 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
   2057 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
   2058 		freemsg(mp);
   2059 		return (NULL);
   2060 	}
   2061 
   2062 	if (ip6h->ip6_hlim != 1) {
   2063 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
   2064 		freemsg(mp);
   2065 		return (NULL);
   2066 	}
   2067 
   2068 	/* Get to the icmp header part */
   2069 	hdr_length = ira->ira_ip_hdr_length;
   2070 	exthdr_length = hdr_length - IPV6_HDR_LEN;
   2071 
   2072 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
   2073 
   2074 	/* An MLD packet must at least be 24 octets to be valid */
   2075 	if (mldlen < MLD_MINLEN) {
   2076 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
   2077 		freemsg(mp);
   2078 		return (NULL);
   2079 	}
   2080 
   2081 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
   2082 
   2083 	switch (mldh->mld_type) {
   2084 	case MLD_LISTENER_QUERY:
   2085 		/*
   2086 		 * packet length differentiates between v1 and v2.  v1
   2087 		 * query should be exactly 24 octets long; v2 is >= 28.
   2088 		 */
   2089 		if ((mldlen == MLD_MINLEN) ||
   2090 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
   2091 			next = mld_query_in(mldh, ill);
   2092 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
   2093 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
   2094 		} else {
   2095 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
   2096 			freemsg(mp);
   2097 			return (NULL);
   2098 		}
   2099 		if (next == 0) {
   2100 			return (mp);
   2101 		}
   2102 
   2103 		if (next != INFINITY)
   2104 			mld_start_timers(next, ipst);
   2105 		break;
   2106 
   2107 	case MLD_LISTENER_REPORT:
   2108 		/*
   2109 		 * For fast leave to work, we have to know that we are the
   2110 		 * last person to send a report for this group.  Reports
   2111 		 * generated by us are looped back since we could potentially
   2112 		 * be a multicast router, so discard reports sourced by me.
   2113 		 */
   2114 		mutex_enter(&ill->ill_lock);
   2115 		for (ipif = ill->ill_ipif; ipif != NULL;
   2116 		    ipif = ipif->ipif_next) {
   2117 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
   2118 			    &ip6h->ip6_src)) {
   2119 				if (ip_debug > 1) {
   2120 					char    buf1[INET6_ADDRSTRLEN];
   2121 
   2122 					(void) mi_strlog(ill->ill_rq,
   2123 					    1,
   2124 					    SL_TRACE,
   2125 					    "mld_input: we are only "
   2126 					    "member src %s\n",
   2127 					    inet_ntop(AF_INET6, &ip6h->ip6_src,
   2128 					    buf1, sizeof (buf1)));
   2129 				}
   2130 				mutex_exit(&ill->ill_lock);
   2131 				return (mp);
   2132 			}
   2133 		}
   2134 		mutex_exit(&ill->ill_lock);
   2135 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
   2136 
   2137 		v6group_ptr = &mldh->mld_addr;
   2138 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
   2139 			BUMP_MIB(ill->ill_icmp6_mib,
   2140 			    ipv6IfIcmpInGroupMembBadReports);
   2141 			freemsg(mp);
   2142 			return (NULL);
   2143 		}
   2144 
   2145 
   2146 		/*
   2147 		 * If we belong to the group being reported, and we are a
   2148 		 * 'Delaying member' per the RFC terminology, stop our timer
   2149 		 * for that group and 'clear flag' i.e. mark ilm_state as
   2150 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
   2151 		 * membership entries for the same group address (one per zone)
   2152 		 * so we need to walk the ill_ilm list.
   2153 		 */
   2154 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
   2155 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
   2156 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
   2157 				continue;
   2158 			BUMP_MIB(ill->ill_icmp6_mib,
   2159 			    ipv6IfIcmpInGroupMembOurReports);
   2160 
   2161 			ilm->ilm_timer = INFINITY;
   2162 			ilm->ilm_state = IGMP_OTHERMEMBER;
   2163 		}
   2164 		rw_exit(&ill->ill_mcast_lock);
   2165 		/*
   2166 		 * No packets have been sent above - no
   2167 		 * ill_mcast_send_queued is needed.
   2168 		 */
   2169 		ill_mcast_timer_start(ill->ill_ipst);
   2170 		break;
   2171 
   2172 	case MLD_LISTENER_REDUCTION:
   2173 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
   2174 		break;
   2175 	}
   2176 	return (mp);
   2177 }
   2178 
   2179 /*
   2180  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
   2181  * (non-zero, unsigned) timer value to be set on success.
   2182  */
   2183 static uint_t
   2184 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
   2185 {
   2186 	ilm_t	*ilm;
   2187 	int	timer;
   2188 	uint_t	next, current;
   2189 	in6_addr_t *v6group;
   2190 
   2191 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
   2192 
   2193 	/*
   2194 	 * In the MLD specification, there are 3 states and a flag.
   2195 	 *
   2196 	 * In Non-Listener state, we simply don't have a membership record.
   2197 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
   2198 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
   2199 	 * INFINITY)
   2200 	 *
   2201 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
   2202 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
   2203 	 * if I sent the last report.
   2204 	 */
   2205 	v6group = &mldh->mld_addr;
   2206 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
   2207 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
   2208 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
   2209 		return (0);
   2210 	}
   2211 
   2212 	/* Need to do compatibility mode checking */
   2213 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
   2214 	ill->ill_mcast_v1_time = 0;
   2215 	ill->ill_mcast_v1_tset = 1;
   2216 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
   2217 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
   2218 		    "MLD_V1_ROUTER\n", ill->ill_name));
   2219 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
   2220 		ill->ill_mcast_type = MLD_V1_ROUTER;
   2221 	}
   2222 
   2223 	timer = (int)ntohs(mldh->mld_maxdelay);
   2224 	if (ip_debug > 1) {
   2225 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
   2226 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
   2227 		    timer, (int)mldh->mld_type);
   2228 	}
   2229 
   2230 	/*
   2231 	 * -Start the timers in all of our membership records for
   2232 	 * the physical interface on which the query arrived,
   2233 	 * excl:
   2234 	 *	1.  those that belong to the "all hosts" group,
   2235 	 *	2.  those with 0 scope, or 1 node-local scope.
   2236 	 *
   2237 	 * -Restart any timer that is already running but has a value
   2238 	 * longer that the requested timeout.
   2239 	 * -Use the value specified in the query message as the
   2240 	 * maximum timeout.
   2241 	 */
   2242 	next = INFINITY;
   2243 
   2244 	current = CURRENT_MSTIME;
   2245 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
   2246 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
   2247 
   2248 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
   2249 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
   2250 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
   2251 			continue;
   2252 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
   2253 		    &ipv6_all_hosts_mcast)) &&
   2254 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
   2255 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
   2256 			if (timer == 0) {
   2257 				/* Respond immediately */
   2258 				ilm->ilm_timer = INFINITY;
   2259 				ilm->ilm_state = IGMP_IREPORTEDLAST;
   2260 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
   2261 				break;
   2262 			}
   2263 			if (ilm->ilm_timer > timer) {
   2264 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
   2265 				if (ilm->ilm_timer < next)
   2266 					next = ilm->ilm_timer;
   2267 				ilm->ilm_timer += current;
   2268 			}
   2269 			break;
   2270 		}
   2271 	}
   2272 	rw_exit(&ill->ill_mcast_lock);
   2273 	/* Send any deferred/queued IP packets */
   2274 	ill_mcast_send_queued(ill);
   2275 	ill_mcast_timer_start(ill->ill_ipst);
   2276 
   2277 	return (next);
   2278 }
   2279 
   2280 /*
   2281  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
   2282  * returns the appropriate (non-zero, unsigned) timer value (which may
   2283  * be INFINITY) to be set.
   2284  */
   2285 static uint_t
   2286 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
   2287 {
   2288 	ilm_t	*ilm;
   2289 	in6_addr_t *v6group, *src_array;
   2290 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
   2291 	uint8_t	qrv;
   2292 
   2293 	v6group = &mld2q->mld2q_addr;
   2294 	numsrc = ntohs(mld2q->mld2q_numsrc);
   2295 
   2296 	/* make sure numsrc matches packet size */
   2297 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
   2298 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
   2299 		return (0);
   2300 	}
   2301 	src_array = (in6_addr_t *)&mld2q[1];
   2302 
   2303 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
   2304 
   2305 	/* extract Maximum Response Delay from code in header */
   2306 	mrd = ntohs(mld2q->mld2q_mxrc);
   2307 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
   2308 		uint_t hdrval, mant, exp;
   2309 		hdrval = mrd;
   2310 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
   2311 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
   2312 		mrd = (mant | 0x1000) << (exp + 3);
   2313 	}
   2314 	if (mrd == 0)
   2315 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
   2316 
   2317 	MCAST_RANDOM_DELAY(delay, mrd);
   2318 	next = (unsigned)INFINITY;
   2319 	current = CURRENT_MSTIME;
   2320 
   2321 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
   2322 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
   2323 	else
   2324 		ill->ill_mcast_rv = qrv;
   2325 
   2326 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
   2327 		uint_t mant, exp;
   2328 		mant = qqi & MLD_V2_QQI_MANT_MASK;
   2329 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
   2330 		qqi = (mant | 0x10) << (exp + 3);
   2331 	}
   2332 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
   2333 
   2334 	/*
   2335 	 * If we have a pending general query response that's scheduled
   2336 	 * sooner than the delay we calculated for this response, then
   2337 	 * no action is required (MLDv2 draft section 6.2 rule 1)
   2338 	 */
   2339 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
   2340 	if (ill->ill_global_timer < (current + delay)) {
   2341 		rw_exit(&ill->ill_mcast_lock);
   2342 		return (next);
   2343 	}
   2344 
   2345 	/*
   2346 	 * Now take action depending on query type: general,
   2347 	 * group specific, or group/source specific.
   2348 	 */
   2349 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
   2350 		/*
   2351 		 * general query
   2352 		 * We know global timer is either not running or is
   2353 		 * greater than our calculated delay, so reset it to
   2354 		 * our delay (random value in range [0, response time])
   2355 		 */
   2356 		ill->ill_global_timer = current + delay;
   2357 		next = delay;
   2358 	} else {
   2359 		/* group or group/source specific query */
   2360 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
   2361 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
   2362 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
   2363 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
   2364 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
   2365 				continue;
   2366 
   2367 			/*
   2368 			 * If the query is group specific or we have a
   2369 			 * pending group specific query, the response is
   2370 			 * group specific (pending sources list should be
   2371 			 * empty).  Otherwise, need to update the pending
   2372 			 * sources list for the group and source specific
   2373 			 * response.
   2374 			 */
   2375 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
   2376 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
   2377 group_query:
   2378 				FREE_SLIST(ilm->ilm_pendsrcs);
   2379 				ilm->ilm_pendsrcs = NULL;
   2380 			} else {
   2381 				boolean_t overflow;
   2382 				slist_t *pktl;
   2383 				if (numsrc > MAX_FILTER_SIZE ||
   2384 				    (ilm->ilm_pendsrcs == NULL &&
   2385 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
   2386 					/*
   2387 					 * We've been sent more sources than
   2388 					 * we can deal with; or we can't deal
   2389 					 * with a source list at all. Revert
   2390 					 * to a group specific query.
   2391 					 */
   2392 					goto group_query;
   2393 				}
   2394 				if ((pktl = l_alloc()) == NULL)
   2395 					goto group_query;
   2396 				pktl->sl_numsrc = numsrc;
   2397 				for (i = 0; i < numsrc; i++)
   2398 					pktl->sl_addr[i] = src_array[i];
   2399 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
   2400 				    &overflow);
   2401 				l_free(pktl);
   2402 				if (overflow)
   2403 					goto group_query;
   2404 			}
   2405 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
   2406 			    INFINITY : (ilm->ilm_timer - current);
   2407 			/* set timer to soonest value */
   2408 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
   2409 			if (ilm->ilm_timer < next)
   2410 				next = ilm->ilm_timer;
   2411 			ilm->ilm_timer += current;
   2412 			break;
   2413 		}
   2414 	}
   2415 	rw_exit(&ill->ill_mcast_lock);
   2416 	/*
   2417 	 * No packets have been sent above - no
   2418 	 * ill_mcast_send_queued is needed.
   2419 	 */
   2420 	ill_mcast_timer_start(ill->ill_ipst);
   2421 
   2422 	return (next);
   2423 }
   2424 
   2425 /*
   2426  * Send MLDv1 response packet with hoplimit 1
   2427  */
   2428 static void
   2429 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
   2430 {
   2431 	mblk_t		*mp;
   2432 	mld_hdr_t	*mldh;
   2433 	ip6_t 		*ip6h;
   2434 	ip6_hbh_t	*ip6hbh;
   2435 	struct ip6_opt_router	*ip6router;
   2436 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
   2437 	ill_t		*ill = ilm->ilm_ill;
   2438 
   2439 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
   2440 
   2441 	/*
   2442 	 * We need to place a router alert option in this packet.  The length
   2443 	 * of the options must be a multiple of 8.  The hbh option header is 2
   2444 	 * bytes followed by the 4 byte router alert option.  That leaves
   2445 	 * 2 bytes of pad for a total of 8 bytes.
   2446 	 */
   2447 	const int	router_alert_length = 8;
   2448 
   2449 	ASSERT(ill->ill_isv6);
   2450 
   2451 	size += router_alert_length;
   2452 	mp = allocb(size, BPRI_HI);
   2453 	if (mp == NULL)
   2454 		return;
   2455 	bzero(mp->b_rptr, size);
   2456 	mp->b_wptr = mp->b_rptr + size;
   2457 
   2458 	ip6h = (ip6_t *)mp->b_rptr;
   2459 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
   2460 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
   2461 	/*
   2462 	 * A zero is a pad option of length 1.  The bzero of the whole packet
   2463 	 * above will pad between ip6router and mld.
   2464 	 */
   2465 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
   2466 
   2467 	mldh->mld_type = type;
   2468 	mldh->mld_addr = ilm->ilm_v6addr;
   2469 
   2470 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
   2471 	ip6router->ip6or_len = 2;
   2472 	ip6router->ip6or_value[0] = 0;
   2473 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
   2474 
   2475 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
   2476 	ip6hbh->ip6h_len = 0;
   2477 
   2478 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
   2479 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
   2480 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
   2481 	ip6h->ip6_hops = MLD_HOP_LIMIT;
   2482 	if (v6addr == NULL)
   2483 		ip6h->ip6_dst =  ilm->ilm_v6addr;
   2484 	else
   2485 		ip6h->ip6_dst = *v6addr;
   2486 
   2487 	ip6h->ip6_src = ipv6_all_zeros;
   2488 	/*
   2489 	 * Prepare for checksum by putting icmp length in the icmp
   2490 	 * checksum field. The checksum is calculated in ip_output.
   2491 	 */
   2492 	mldh->mld_cksum = htons(sizeof (*mldh));
   2493 
   2494 	ill_mcast_queue(ill, mp);
   2495 }
   2496 
   2497 /*
   2498  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
   2499  * report will contain one multicast address record for each element of
   2500  * reclist.  If this causes packet length to exceed ill->ill_mtu,
   2501  * multiple reports are sent.  reclist is assumed to be made up of
   2502  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
   2503  */
   2504 static void
   2505 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
   2506 {
   2507 	mblk_t		*mp;
   2508 	mld2r_t		*mld2r;
   2509 	mld2mar_t	*mld2mar;
   2510 	in6_addr_t	*srcarray;
   2511 	ip6_t		*ip6h;
   2512 	ip6_hbh_t	*ip6hbh;
   2513 	struct ip6_opt_router	*ip6router;
   2514 	size_t		size, optlen, padlen, icmpsize, rsize;
   2515 	int		i, numrec, more_src_cnt;
   2516 	mrec_t		*rp, *cur_reclist;
   2517 	mrec_t		*next_reclist = reclist;
   2518 	boolean_t	morepkts;
   2519 
   2520 	/* If there aren't any records, there's nothing to send */
   2521 	if (reclist == NULL)
   2522 		return;
   2523 
   2524 	ASSERT(ill->ill_isv6);
   2525 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
   2526 
   2527 	/*
   2528 	 * Total option length (optlen + padlen) must be a multiple of
   2529 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
   2530 	 * length will be 8.  Assert this in case anything ever changes.
   2531 	 */
   2532 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
   2533 	ASSERT(optlen <= 8);
   2534 	padlen = 8 - optlen;
   2535 nextpkt:
   2536 	icmpsize = sizeof (mld2r_t);
   2537 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
   2538 	morepkts = B_FALSE;
   2539 	more_src_cnt = 0;
   2540 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
   2541 	    rp = rp->mrec_next, numrec++) {
   2542 		rsize = sizeof (mld2mar_t) +
   2543 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
   2544 		if (size + rsize > ill->ill_mtu) {
   2545 			if (rp == cur_reclist) {
   2546 				/*
   2547 				 * If the first mrec we looked at is too big
   2548 				 * to fit in a single packet (i.e the source
   2549 				 * list is too big), we must either truncate
   2550 				 * the list (if TO_EX or IS_EX), or send
   2551 				 * multiple reports for the same group (all
   2552 				 * other types).
   2553 				 */
   2554 				int srcspace, srcsperpkt;
   2555 				srcspace = ill->ill_mtu -
   2556 				    (size + sizeof (mld2mar_t));
   2557 
   2558 				/*
   2559 				 * Skip if there's not even enough room in
   2560 				 * a single packet to send something useful.
   2561 				 */
   2562 				if (srcspace <= sizeof (in6_addr_t))
   2563 					continue;
   2564 
   2565 				srcsperpkt = srcspace / sizeof (in6_addr_t);
   2566 				/*
   2567 				 * Increment icmpsize and size, because we will
   2568 				 * be sending a record for the mrec we're
   2569 				 * looking at now.
   2570 				 */
   2571 				rsize = sizeof (mld2mar_t) +
   2572 				    (srcsperpkt * sizeof (in6_addr_t));
   2573 				icmpsize += rsize;
   2574 				size += rsize;
   2575 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
   2576 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
   2577 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
   2578 					if (rp->mrec_next == NULL) {
   2579 						/* no more packets to send */
   2580 						break;
   2581 					} else {
   2582 						/*
   2583 						 * more packets, but we're
   2584 						 * done with this mrec.
   2585 						 */
   2586 						next_reclist = rp->mrec_next;
   2587 					}
   2588 				} else {
   2589 					more_src_cnt = rp->mrec_srcs.sl_numsrc
   2590 					    - srcsperpkt;
   2591 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
   2592 					/*
   2593 					 * We'll fix up this mrec (remove the
   2594 					 * srcs we've already sent) before
   2595 					 * returning to nextpkt above.
   2596 					 */
   2597 					next_reclist = rp;
   2598 				}
   2599 			} else {
   2600 				next_reclist = rp;
   2601 			}
   2602 			morepkts = B_TRUE;
   2603 			break;
   2604 		}
   2605 		icmpsize += rsize;
   2606 		size += rsize;
   2607 	}
   2608 
   2609 	mp = allocb(size, BPRI_HI);
   2610 	if (mp == NULL)
   2611 		goto free_reclist;
   2612 	bzero(mp->b_rptr, size);
   2613 	mp->b_wptr = mp->b_rptr + size;
   2614 
   2615 	ip6h = (ip6_t *)mp->b_rptr;
   2616 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
   2617 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
   2618 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
   2619 	mld2mar = (mld2mar_t *)&(mld2r[1]);
   2620 
   2621 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
   2622 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
   2623 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
   2624 	ip6h->ip6_hops = MLD_HOP_LIMIT;
   2625 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
   2626 	ip6h->ip6_src = ipv6_all_zeros;
   2627 
   2628 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
   2629 	/*
   2630 	 * ip6h_len is the number of 8-byte words, not including the first
   2631 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
   2632 	 */
   2633 	ip6hbh->ip6h_len = 0;
   2634 
   2635 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
   2636 	ip6router->ip6or_len = 2;
   2637 	ip6router->ip6or_value[0] = 0;
   2638 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
   2639 
   2640 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
   2641 	mld2r->mld2r_nummar = htons(numrec);
   2642 	/*
   2643 	 * Prepare for the checksum by putting icmp length in the icmp
   2644 	 * checksum field. The checksum is calculated in ip_output_simple.
   2645 	 */
   2646 	mld2r->mld2r_cksum = htons(icmpsize);
   2647 
   2648 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
   2649 		mld2mar->mld2mar_type = rp->mrec_type;
   2650 		mld2mar->mld2mar_auxlen = 0;
   2651 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
   2652 		mld2mar->mld2mar_group = rp->mrec_group;
   2653 		srcarray = (in6_addr_t *)&(mld2mar[1]);
   2654 
   2655 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
   2656 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
   2657 
   2658 		mld2mar = (mld2mar_t *)&(srcarray[i]);
   2659 	}
   2660 
   2661 	ill_mcast_queue(ill, mp);
   2662 
   2663 	if (morepkts) {
   2664 		if (more_src_cnt > 0) {
   2665 			int index, mvsize;
   2666 			slist_t *sl = &next_reclist->mrec_srcs;
   2667 			index = sl->sl_numsrc;
   2668 			mvsize = more_src_cnt * sizeof (in6_addr_t);
   2669 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
   2670 			    mvsize);
   2671 			sl->sl_numsrc = more_src_cnt;
   2672 		}
   2673 		goto nextpkt;
   2674 	}
   2675 
   2676 free_reclist:
   2677 	while (reclist != NULL) {
   2678 		rp = reclist->mrec_next;
   2679 		mi_free(reclist);
   2680 		reclist = rp;
   2681 	}
   2682 }
   2683 
   2684 static mrec_t *
   2685 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
   2686     mrec_t *next)
   2687 {
   2688 	mrec_t *rp;
   2689 	int i;
   2690 
   2691 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
   2692 	    SLIST_IS_EMPTY(srclist))
   2693 		return (next);
   2694 
   2695 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
   2696 	if (rp == NULL)
   2697 		return (next);
   2698 
   2699 	rp->mrec_next = next;
   2700 	rp->mrec_type = type;
   2701 	rp->mrec_auxlen = 0;
   2702 	rp->mrec_group = *grp;
   2703 	if (srclist == NULL) {
   2704 		rp->mrec_srcs.sl_numsrc = 0;
   2705 	} else {
   2706 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
   2707 		for (i = 0; i < srclist->sl_numsrc; i++)
   2708 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
   2709 	}
   2710 
   2711 	return (rp);
   2712 }
   2713 
   2714 /*
   2715  * Set up initial retransmit state.  If memory cannot be allocated for
   2716  * the source lists, simply create as much state as is possible; memory
   2717  * allocation failures are considered one type of transient error that
   2718  * the retransmissions are designed to overcome (and if they aren't
   2719  * transient, there are bigger problems than failing to notify the
   2720  * router about multicast group membership state changes).
   2721  */
   2722 static void
   2723 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
   2724     slist_t *flist)
   2725 {
   2726 	/*
   2727 	 * There are only three possibilities for rtype:
   2728 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
   2729 	 *	  => rtype is ALLOW_NEW_SOURCES
   2730 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
   2731 	 *	  => rtype is CHANGE_TO_EXCLUDE
   2732 	 *	State change that involves a filter mode change
   2733 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
   2734 	 */
   2735 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
   2736 	    rtype == ALLOW_NEW_SOURCES);
   2737 
   2738 	rtxp->rtx_cnt = ill->ill_mcast_rv;
   2739 
   2740 	switch (rtype) {
   2741 	case CHANGE_TO_EXCLUDE:
   2742 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
   2743 		CLEAR_SLIST(rtxp->rtx_allow);
   2744 		COPY_SLIST(flist, rtxp->rtx_block);
   2745 		break;
   2746 	case ALLOW_NEW_SOURCES:
   2747 	case CHANGE_TO_INCLUDE:
   2748 		rtxp->rtx_fmode_cnt =
   2749 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
   2750 		CLEAR_SLIST(rtxp->rtx_block);
   2751 		COPY_SLIST(flist, rtxp->rtx_allow);
   2752 		break;
   2753 	}
   2754 }
   2755 
   2756 /*
   2757  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
   2758  * RFC 3376 section 5.1, covers three cases:
   2759  *	* The current state change is a filter mode change
   2760  *		Set filter mode retransmit counter; set retransmit allow or
   2761  *		block list to new source list as appropriate, and clear the
   2762  *		retransmit list that was not set; send TO_IN or TO_EX with
   2763  *		new source list.
   2764  *	* The current state change is a source list change, but the filter
   2765  *	  mode retransmit counter is > 0
   2766  *		Decrement filter mode retransmit counter; set retransmit
   2767  *		allow or block list to  new source list as appropriate,
   2768  *		and clear the retransmit list that was not set; send TO_IN
   2769  *		or TO_EX with new source list.
   2770  *	* The current state change is a source list change, and the filter
   2771  *	  mode retransmit counter is 0.
   2772  *		Merge existing rtx allow and block lists with new state:
   2773  *		  rtx_allow = (new allow + rtx_allow) - new block
   2774  *		  rtx_block = (new block + rtx_block) - new allow
   2775  *		Send ALLOW and BLOCK records for new retransmit lists;
   2776  *		decrement retransmit counter.
   2777  *
   2778  * As is the case for mcast_init_rtx(), memory allocation failures are
   2779  * acceptable; we just create as much state as we can.
   2780  */
   2781 static mrec_t *
   2782 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
   2783 {
   2784 	ill_t *ill;
   2785 	rtx_state_t *rtxp = &ilm->ilm_rtx;
   2786 	mcast_record_t txtype;
   2787 	mrec_t *rp, *rpnext, *rtnmrec;
   2788 	boolean_t ovf;
   2789 
   2790 	ill = ilm->ilm_ill;
   2791 
   2792 	if (mreclist == NULL)
   2793 		return (mreclist);
   2794 
   2795 	/*
   2796 	 * A filter mode change is indicated by a single mrec, which is
   2797 	 * either TO_IN or TO_EX.  In this case, we just need to set new
   2798 	 * retransmit state as if this were an initial join.  There is
   2799 	 * no change to the mrec list.
   2800 	 */
   2801 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
   2802 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
   2803 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
   2804 		    &mreclist->mrec_srcs);
   2805 		return (mreclist);
   2806 	}
   2807 
   2808 	/*
   2809 	 * Only the source list has changed
   2810 	 */
   2811 	rtxp->rtx_cnt = ill->ill_mcast_rv;
   2812 	if (rtxp->rtx_fmode_cnt > 0) {
   2813 		/* but we're still sending filter mode change reports */
   2814 		rtxp->rtx_fmode_cnt--;
   2815 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
   2816 			CLEAR_SLIST(rtxp->rtx_block);
   2817 			COPY_SLIST(flist, rtxp->rtx_allow);
   2818 			txtype = CHANGE_TO_INCLUDE;
   2819 		} else {
   2820 			CLEAR_SLIST(rtxp->rtx_allow);
   2821 			COPY_SLIST(flist, rtxp->rtx_block);
   2822 			txtype = CHANGE_TO_EXCLUDE;
   2823 		}
   2824 		/* overwrite first mrec with new info */
   2825 		mreclist->mrec_type = txtype;
   2826 		l_copy(flist, &mreclist->mrec_srcs);
   2827 		/* then free any remaining mrecs */
   2828 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
   2829 			rpnext = rp->mrec_next;
   2830 			mi_free(rp);
   2831 		}
   2832 		mreclist->mrec_next = NULL;
   2833 		rtnmrec = mreclist;
   2834 	} else {
   2835 		mrec_t *allow_mrec, *block_mrec;
   2836 		/*
   2837 		 * Just send the source change reports; but we need to
   2838 		 * recalculate the ALLOW and BLOCK lists based on previous
   2839 		 * state and new changes.
   2840 		 */
   2841 		rtnmrec = mreclist;
   2842 		allow_mrec = block_mrec = NULL;
   2843 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
   2844 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
   2845 			    rp->mrec_type == BLOCK_OLD_SOURCES);
   2846 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
   2847 				allow_mrec = rp;
   2848 			else
   2849 				block_mrec = rp;
   2850 		}
   2851 		/*
   2852 		 * Perform calculations:
   2853 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
   2854 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
   2855 		 *
   2856 		 * Each calc requires two steps, for example:
   2857 		 *   rtx_allow = rtx_allow - mrec_block;
   2858 		 *   new_allow = mrec_allow + rtx_allow;
   2859 		 *
   2860 		 * Store results in mrec lists, and then copy into rtx lists.
   2861 		 * We do it in this order in case the rtx list hasn't been
   2862 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
   2863 		 * Overflows are also okay.
   2864 		 */
   2865 		if (block_mrec != NULL) {
   2866 			l_difference_in_a(rtxp->rtx_allow,
   2867 			    &block_mrec->mrec_srcs);
   2868 		}
   2869 		if (allow_mrec != NULL) {
   2870 			l_difference_in_a(rtxp->rtx_block,
   2871 			    &allow_mrec->mrec_srcs);
   2872 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
   2873 			    &ovf);
   2874 		}
   2875 		if (block_mrec != NULL) {
   2876 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
   2877 			    &ovf);
   2878 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
   2879 		} else {
   2880 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
   2881 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
   2882 		}
   2883 		if (allow_mrec != NULL) {
   2884 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
   2885 		} else {
   2886 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
   2887 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
   2888 		}
   2889 	}
   2890 
   2891 	return (rtnmrec);
   2892 }
   2893