Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 /*
     28  * Procedures for the kernel part of DVMRP,
     29  * a Distance-Vector Multicast Routing Protocol.
     30  * (See RFC-1075)
     31  * Written by David Waitzman, BBN Labs, August 1988.
     32  * Modified by Steve Deering, Stanford, February 1989.
     33  * Modified by Mark J. Steiglitz, Stanford, May, 1991
     34  * Modified by Van Jacobson, LBL, January 1993
     35  * Modified by Ajit Thyagarajan, PARC, August 1993
     36  * Modified by Bill Fenner, PARC, April 1995
     37  *
     38  * MROUTING 3.5
     39  */
     40 
     41 /*
     42  * TODO
     43  * - function pointer field in vif, void *vif_sendit()
     44  */
     45 
     46 #include <sys/types.h>
     47 #include <sys/stream.h>
     48 #include <sys/stropts.h>
     49 #include <sys/strlog.h>
     50 #include <sys/systm.h>
     51 #include <sys/ddi.h>
     52 #include <sys/cmn_err.h>
     53 #include <sys/zone.h>
     54 
     55 #include <sys/param.h>
     56 #include <sys/socket.h>
     57 #include <sys/vtrace.h>
     58 #include <sys/debug.h>
     59 #include <net/if.h>
     60 #include <sys/sockio.h>
     61 #include <netinet/in.h>
     62 #include <net/if_dl.h>
     63 
     64 #include <inet/ipsec_impl.h>
     65 #include <inet/common.h>
     66 #include <inet/mi.h>
     67 #include <inet/nd.h>
     68 #include <inet/mib2.h>
     69 #include <netinet/ip6.h>
     70 #include <inet/ip.h>
     71 #include <inet/snmpcom.h>
     72 
     73 #include <netinet/igmp.h>
     74 #include <netinet/igmp_var.h>
     75 #include <netinet/udp.h>
     76 #include <netinet/ip_mroute.h>
     77 #include <inet/ip_multi.h>
     78 #include <inet/ip_ire.h>
     79 #include <inet/ip_ndp.h>
     80 #include <inet/ip_if.h>
     81 #include <inet/ipclassifier.h>
     82 
     83 #include <netinet/pim.h>
     84 
     85 
     86 /*
     87  * MT Design:
     88  *
     89  * There are three main data structures viftable, mfctable and tbftable that
     90  * need to be protected against MT races.
     91  *
 * viftable is a fixed length array of vif structs. There is no lock to protect
 * the whole array, instead each struct is protected by its own individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines the
 * current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that the vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use.  When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
    102  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
    103  * from  going away a refhold is put on the ipif before using it. see
    104  * lock_good_vif() and unlock_good_vif().
    105  *
    106  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
    107  * of the vif struct.
    108  *
    109  * tbftable is also a fixed length array of tbf structs and is only accessed
    110  * via v_tbf.  It is protected by its own lock tbf_lock.
    111  *
    112  * Lock Ordering is
    113  * v_lock --> tbf_lock
 * v_lock --> ill_lock
    115  *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
    117  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
    118  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
    119  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
    120  * protect the struct elements.
    121  *
    122  * mfc structs are dynamically allocated and are singly linked
    123  * at the head of the chain. When an mfc structure is to be deleted
    124  * it is marked condemned and so is the state in the bucket struct.
    125  * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
    127  *
    128  * Locking Hierarchy:
    129  * The bucket lock should be acquired before the mfc struct lock.
    130  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
    131  * operations on the bucket struct.
    132  *
    133  * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
    135  *
    136  * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
    138  */
    139 
    140 #define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */
    141 
    142 /*
    143  * Timeouts:
    144  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
    145  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
    146  *	SunOS 5.x uses mfc->timeout for each mfc.
    147  *	Some Unixes are limited in the number of simultaneous timeouts
    148  * 	that can be run, SunOS 5.x does not have this restriction.
    149  */
    150 
    151 /*
    152  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
    154  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
    155  */
    156 #define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
    157 #define		UPCALL_EXPIRE	6	/* number of timeouts	*/
    158 
    159 /*
    160  * Hash function for a source, group entry
    161  */
    162 #define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
    163 	((g) >> 20) ^ ((g) >> 10) ^ (g))
    164 
    165 #define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/
    166 
    167 /* Identify PIM packet that came on a Register interface */
    168 #define	PIM_REGISTER_MARKER	0xffffffff
    169 
/*
 * Function declarations
 *
 * Forward declarations of the routines that are private (static) to this
 * file, so that they can be referenced before their definitions.
 */
static int	add_mfc(struct mfcctl *, ip_stack_t *);
static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
static int	del_mfc(struct mfcctl *, ip_stack_t *);
static int	del_vif(vifi_t *, ip_stack_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
static void	free_queue(struct mfc *);
static int	get_assert(uchar_t *, ip_stack_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(mblk_t *, ip_recv_attr_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *, ip_stack_t *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
/*
 * The next two are not token bucket routines: release_mfc() operates on an
 * mfc hash bucket (it is what MFCB_REFRELE invokes), and is_mrouter_off()
 * reports whether multicast routing is currently disabled.
 */
static void release_mfc(struct mfcb *);

static boolean_t is_mrouter_off(ip_stack_t *);
    207 /*
    208  * Encapsulation packets
    209  */
    210 
    211 #define	ENCAP_TTL	64
    212 
/*
 * prototype IP hdr for encapsulated packets
 *
 * The initializer is positional, so the values below must stay in ipha_t
 * member order.  Only the leading members are listed; as for any partially
 * initialized static object, the members that follow are zero-initialized.
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,		/* presumably version + hdr length */
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,	/* ttl, protocol */
	0,				/* checksum */
};
    223 
    224 /*
    225  * Rate limit for assert notification messages, in nsec.
    226  */
    227 #define	ASSERT_MSG_TIME		3000000000
    228 
    229 
    230 #define	VIF_REFHOLD(vifp) {			\
    231 	mutex_enter(&(vifp)->v_lock);		\
    232 	(vifp)->v_refcnt++;			\
    233 	mutex_exit(&(vifp)->v_lock);		\
    234 }
    235 
    236 #define	VIF_REFRELE_LOCKED(vifp) {				\
    237 	(vifp)->v_refcnt--;					\
    238 	if ((vifp)->v_refcnt == 0 &&				\
    239 		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
    240 			del_vifp(vifp);				\
    241 	} else {						\
    242 		mutex_exit(&(vifp)->v_lock);			\
    243 	}							\
    244 }
    245 
    246 #define	VIF_REFRELE(vifp) {					\
    247 	mutex_enter(&(vifp)->v_lock);				\
    248 	(vifp)->v_refcnt--;					\
    249 	if ((vifp)->v_refcnt == 0 &&				\
    250 		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
    251 			del_vifp(vifp);				\
    252 	} else {						\
    253 		mutex_exit(&(vifp)->v_lock);			\
    254 	}							\
    255 }
    256 
    257 #define	MFCB_REFHOLD(mfcb) {				\
    258 	mutex_enter(&(mfcb)->mfcb_lock);		\
    259 	(mfcb)->mfcb_refcnt++;				\
    260 	ASSERT((mfcb)->mfcb_refcnt != 0);		\
    261 	mutex_exit(&(mfcb)->mfcb_lock);			\
    262 }
    263 
    264 #define	MFCB_REFRELE(mfcb) {					\
    265 	mutex_enter(&(mfcb)->mfcb_lock);			\
    266 	ASSERT((mfcb)->mfcb_refcnt != 0);			\
    267 	if (--(mfcb)->mfcb_refcnt == 0 &&			\
    268 		((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
    269 			release_mfc(mfcb);			\
    270 	}							\
    271 	mutex_exit(&(mfcb)->mfcb_lock);				\
    272 }
    273 
    274 /*
    275  * MFCFIND:
    276  * Find a route for a given origin IP address and multicast group address.
    277  * Skip entries with pending upcalls.
    278  * Type of service parameter to be added in the future!
    279  */
    280 #define	MFCFIND(mfcbp, o, g, rt) { \
    281 	struct mfc *_mb_rt = NULL; \
    282 	rt = NULL; \
    283 	_mb_rt = mfcbp->mfcb_mfc; \
    284 	while (_mb_rt) { \
    285 		if ((_mb_rt->mfc_origin.s_addr == o) && \
    286 		    (_mb_rt->mfc_mcastgrp.s_addr == g) && \
    287 		    (_mb_rt->mfc_rte == NULL) && \
    288 		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
    289 		    rt = _mb_rt; \
    290 		    break; \
    291 		} \
    292 	_mb_rt = _mb_rt->mfc_next; \
    293 	} \
    294 }
    295 
    296 /*
    297  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
    298  * are inefficient. We use gethrestime() which returns a timespec_t with
    299  * sec and nsec, the resolution is machine dependent.
    300  * The following 2 macros have been changed to use nsec instead of usec.
    301  */
    302 /*
    303  * Macros to compute elapsed time efficiently.
    304  * Borrowed from Van Jacobson's scheduling code.
    305  * Delta should be a hrtime_t.
    306  */
    307 #define	TV_DELTA(a, b, delta) { \
    308 	int xxs; \
    309  \
    310 	delta = (a).tv_nsec - (b).tv_nsec; \
    311 	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
    312 		switch (xxs) { \
    313 		case 2: \
    314 		    delta += 1000000000; \
    315 		    /*FALLTHROUGH*/ \
    316 		case 1: \
    317 		    delta += 1000000000; \
    318 		    break; \
    319 		default: \
    320 		    delta += (1000000000 * xxs); \
    321 		} \
    322 	} \
    323 }
    324 
    325 #define	TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
    326 	(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
    327 
    328 /*
    329  * Handle MRT setsockopt commands to modify the multicast routing tables.
    330  */
    331 int
    332 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
    333     int datalen)
    334 {
    335 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    336 
    337 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    338 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
    339 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    340 		return (EACCES);
    341 	}
    342 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    343 
    344 	if (checkonly) {
    345 		/*
    346 		 * do not do operation, just pretend to - new T_CHECK
    347 		 * Note: Even routines further on can probably fail but
    348 		 * this T_CHECK stuff is only to please XTI so it not
    349 		 * necessary to be perfect.
    350 		 */
    351 		switch (cmd) {
    352 		case MRT_INIT:
    353 		case MRT_DONE:
    354 		case MRT_ADD_VIF:
    355 		case MRT_DEL_VIF:
    356 		case MRT_ADD_MFC:
    357 		case MRT_DEL_MFC:
    358 		case MRT_ASSERT:
    359 			return (0);
    360 		default:
    361 			return (EOPNOTSUPP);
    362 		}
    363 	}
    364 
    365 	/*
    366 	 * make sure no command is issued after multicast routing has been
    367 	 * turned off.
    368 	 */
    369 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
    370 		if (is_mrouter_off(ipst))
    371 			return (EINVAL);
    372 	}
    373 
    374 	switch (cmd) {
    375 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
    376 	case MRT_DONE:	return (ip_mrouter_done(ipst));
    377 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp, ipst));
    378 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, ipst));
    379 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
    380 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
    381 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
    382 	default:	   return (EOPNOTSUPP);
    383 	}
    384 }
    385 
    386 /*
    387  * Handle MRT getsockopt commands
    388  */
    389 int
    390 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
    391 {
    392 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    393 
    394 	if (connp != ipst->ips_ip_g_mrouter)
    395 		return (EACCES);
    396 
    397 	switch (cmd) {
    398 	case MRT_VERSION:	return (get_version((uchar_t *)data));
    399 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
    400 	default:		return (EOPNOTSUPP);
    401 	}
    402 }
    403 
    404 /*
    405  * Handle ioctl commands to obtain information from the cache.
    406  * Called with shared access to IP. These are read_only ioctls.
    407  */
    408 /* ARGSUSED */
    409 int
    410 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    411     ip_ioctl_cmd_t *ipip, void *if_req)
    412 {
    413 	mblk_t	*mp1;
    414 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    415 	conn_t		*connp = Q_TO_CONN(q);
    416 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    417 
    418 	/* Existence verified in ip_wput_nondata */
    419 	mp1 = mp->b_cont->b_cont;
    420 
    421 	switch (iocp->ioc_cmd) {
    422 	case (SIOCGETVIFCNT):
    423 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
    424 	case (SIOCGETSGCNT):
    425 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
    426 	case (SIOCGETLSGCNT):
    427 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
    428 	default:
    429 		return (EINVAL);
    430 	}
    431 }
    432 
    433 /*
    434  * Returns the packet, byte, rpf-failure count for the source, group provided.
    435  */
    436 static int
    437 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
    438 {
    439 	struct mfc *rt;
    440 	struct mfcb *mfcbp;
    441 
    442 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
    443 	MFCB_REFHOLD(mfcbp);
    444 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
    445 
    446 	if (rt != NULL) {
    447 		mutex_enter(&rt->mfc_mutex);
    448 		req->pktcnt   = rt->mfc_pkt_cnt;
    449 		req->bytecnt  = rt->mfc_byte_cnt;
    450 		req->wrong_if = rt->mfc_wrong_if;
    451 		mutex_exit(&rt->mfc_mutex);
    452 	} else
    453 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
    454 
    455 	MFCB_REFRELE(mfcbp);
    456 	return (0);
    457 }
    458 
    459 /*
    460  * Returns the packet, byte, rpf-failure count for the source, group provided.
    461  * Uses larger counters and IPv6 addresses.
    462  */
    463 /* ARGSUSED XXX until implemented */
    464 static int
    465 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
    466 {
    467 	/* XXX TODO SIOCGETLSGCNT */
    468 	return (ENXIO);
    469 }
    470 
    471 /*
    472  * Returns the input and output packet and byte counts on the vif provided.
    473  */
    474 static int
    475 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
    476 {
    477 	vifi_t vifi = req->vifi;
    478 
    479 	if (vifi >= ipst->ips_numvifs)
    480 		return (EINVAL);
    481 
    482 	/*
    483 	 * No locks here, an approximation is fine.
    484 	 */
    485 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
    486 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
    487 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
    488 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
    489 
    490 	return (0);
    491 }
    492 
    493 static int
    494 get_version(uchar_t *data)
    495 {
    496 	int *v = (int *)data;
    497 
    498 	*v = 0x0305;	/* XXX !!!! */
    499 
    500 	return (0);
    501 }
    502 
    503 /*
    504  * Set PIM assert processing global.
    505  */
    506 static int
    507 set_assert(int *i, ip_stack_t *ipst)
    508 {
    509 	if ((*i != 1) && (*i != 0))
    510 		return (EINVAL);
    511 
    512 	ipst->ips_pim_assert = *i;
    513 
    514 	return (0);
    515 }
    516 
    517 /*
    518  * Get PIM assert processing global.
    519  */
    520 static int
    521 get_assert(uchar_t *data, ip_stack_t *ipst)
    522 {
    523 	int *i = (int *)data;
    524 
    525 	*i = ipst->ips_pim_assert;
    526 
    527 	return (0);
    528 }
    529 
    530 /*
    531  * Enable multicast routing.
    532  */
    533 static int
    534 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
    535 {
    536 	int	*v;
    537 
    538 	if (data == NULL || (datalen != sizeof (int)))
    539 		return (ENOPROTOOPT);
    540 
    541 	v = (int *)data;
    542 	if (*v != 1)
    543 		return (ENOPROTOOPT);
    544 
    545 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    546 	if (ipst->ips_ip_g_mrouter != NULL) {
    547 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    548 		return (EADDRINUSE);
    549 	}
    550 
    551 	/*
    552 	 * MRT_INIT should only be allowed for RAW sockets, but we double
    553 	 * check.
    554 	 */
    555 	if (!IPCL_IS_RAWIP(connp)) {
    556 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    557 		return (EINVAL);
    558 	}
    559 
    560 	ipst->ips_ip_g_mrouter = connp;
    561 	connp->conn_multi_router = 1;
    562 	/* In order for tunnels to work we have to turn ip_g_forward on */
    563 	if (!WE_ARE_FORWARDING(ipst)) {
    564 		if (ipst->ips_ip_mrtdebug > 1) {
    565 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
    566 			    "ip_mrouter_init: turning on forwarding");
    567 		}
    568 		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
    569 		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
    570 	}
    571 
    572 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    573 	return (0);
    574 }
    575 
    576 void
    577 ip_mrouter_stack_init(ip_stack_t *ipst)
    578 {
    579 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
    580 
    581 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
    582 	    KM_SLEEP);
    583 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
    584 	/*
    585 	 * mfctable:
    586 	 * Includes all mfcs, including waiting upcalls.
    587 	 * Multiple mfcs per bucket.
    588 	 */
    589 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
    590 	    KM_SLEEP);
    591 	/*
    592 	 * Define the token bucket filter structures.
    593 	 * tbftable -> each vif has one of these for storing info.
    594 	 */
    595 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
    596 
    597 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
    598 
    599 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
    600 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
    601 }
    602 
    603 /*
    604  * Disable multicast routing.
    605  * Didn't use global timeout_val (BSD version), instead check the mfctable.
    606  */
    607 int
    608 ip_mrouter_done(ip_stack_t *ipst)
    609 {
    610 	conn_t		*mrouter;
    611 	vifi_t 		vifi;
    612 	struct mfc	*mfc_rt;
    613 	int		i;
    614 
    615 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    616 	if (ipst->ips_ip_g_mrouter == NULL) {
    617 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    618 		return (EINVAL);
    619 	}
    620 
    621 	mrouter = ipst->ips_ip_g_mrouter;
    622 
    623 	if (ipst->ips_saved_ip_g_forward != -1) {
    624 		if (ipst->ips_ip_mrtdebug > 1) {
    625 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
    626 			    "ip_mrouter_done: turning off forwarding");
    627 		}
    628 		ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
    629 		ipst->ips_saved_ip_g_forward = -1;
    630 	}
    631 
    632 	/*
    633 	 * Always clear cache when vifs change.
    634 	 * No need to get ipst->ips_last_encap_lock since we are running as
    635 	 * a writer.
    636 	 */
    637 	mutex_enter(&ipst->ips_last_encap_lock);
    638 	ipst->ips_last_encap_src = 0;
    639 	ipst->ips_last_encap_vif = NULL;
    640 	mutex_exit(&ipst->ips_last_encap_lock);
    641 	mrouter->conn_multi_router = 0;
    642 
    643 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    644 
    645 	/*
    646 	 * For each phyint in use,
    647 	 * disable promiscuous reception of all IP multicasts.
    648 	 */
    649 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
    650 		struct vif *vifp = ipst->ips_vifs + vifi;
    651 
    652 		mutex_enter(&vifp->v_lock);
    653 		/*
    654 		 * if the vif is active mark it condemned.
    655 		 */
    656 		if (vifp->v_marks & VIF_MARK_GOOD) {
    657 			ASSERT(vifp->v_ipif != NULL);
    658 			ipif_refhold(vifp->v_ipif);
    659 			/* Phyint only */
    660 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
    661 				ipif_t *ipif = vifp->v_ipif;
    662 				ilm_t *ilm = vifp->v_ilm;
    663 
    664 				vifp->v_ilm = NULL;
    665 				vifp->v_marks &= ~VIF_MARK_GOOD;
    666 				vifp->v_marks |= VIF_MARK_CONDEMNED;
    667 
    668 				mutex_exit(&(vifp)->v_lock);
    669 				if (ilm != NULL) {
    670 					ill_t *ill = ipif->ipif_ill;
    671 
    672 					(void) ip_delmulti(ilm);
    673 					ASSERT(ill->ill_mrouter_cnt > 0);
    674 					atomic_dec_32(&ill->ill_mrouter_cnt);
    675 				}
    676 				mutex_enter(&vifp->v_lock);
    677 			}
    678 			ipif_refrele(vifp->v_ipif);
    679 			/*
    680 			 * decreases the refcnt added in add_vif.
    681 			 * and release v_lock.
    682 			 */
    683 			VIF_REFRELE_LOCKED(vifp);
    684 		} else {
    685 			mutex_exit(&vifp->v_lock);
    686 			continue;
    687 		}
    688 	}
    689 
    690 	mutex_enter(&ipst->ips_numvifs_mutex);
    691 	ipst->ips_numvifs = 0;
    692 	ipst->ips_pim_assert = 0;
    693 	ipst->ips_reg_vif_num = ALL_VIFS;
    694 	mutex_exit(&ipst->ips_numvifs_mutex);
    695 
    696 	/*
    697 	 * Free upcall msgs.
    698 	 * Go through mfctable and stop any outstanding upcall
    699 	 * timeouts remaining on mfcs.
    700 	 */
    701 	for (i = 0; i < MFCTBLSIZ; i++) {
    702 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
    703 		ipst->ips_mfcs[i].mfcb_refcnt++;
    704 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
    705 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
    706 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
    707 		while (mfc_rt) {
    708 			/* Free upcalls */
    709 			mutex_enter(&mfc_rt->mfc_mutex);
    710 			if (mfc_rt->mfc_rte != NULL) {
    711 				if (mfc_rt->mfc_timeout_id != 0) {
    712 					/*
    713 					 * OK to drop the lock as we have
    714 					 * a refcnt on the bucket. timeout
    715 					 * can fire but it will see that
    716 					 * mfc_timeout_id == 0 and not do
    717 					 * anything. see expire_upcalls().
    718 					 */
    719 					mfc_rt->mfc_timeout_id = 0;
    720 					mutex_exit(&mfc_rt->mfc_mutex);
    721 					(void) untimeout(
    722 					    mfc_rt->mfc_timeout_id);
    723 						mfc_rt->mfc_timeout_id = 0;
    724 					mutex_enter(&mfc_rt->mfc_mutex);
    725 
    726 					/*
    727 					 * all queued upcall packets
    728 					 * and mblk will be freed in
    729 					 * release_mfc().
    730 					 */
    731 				}
    732 			}
    733 
    734 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
    735 
    736 			mutex_exit(&mfc_rt->mfc_mutex);
    737 			mfc_rt = mfc_rt->mfc_next;
    738 		}
    739 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
    740 	}
    741 
    742 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    743 	ipst->ips_ip_g_mrouter = NULL;
    744 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    745 	return (0);
    746 }
    747 
    748 void
    749 ip_mrouter_stack_destroy(ip_stack_t *ipst)
    750 {
    751 	struct mfcb *mfcbp;
    752 	struct mfc  *rt;
    753 	int i;
    754 
    755 	for (i = 0; i < MFCTBLSIZ; i++) {
    756 		mfcbp = &ipst->ips_mfcs[i];
    757 
    758 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
    759 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
    760 			    i);
    761 
    762 			mfcbp->mfcb_mfc = rt->mfc_next;
    763 			free_queue(rt);
    764 			mi_free(rt);
    765 		}
    766 	}
    767 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
    768 	ipst->ips_vifs = NULL;
    769 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
    770 	ipst->ips_mrtstat = NULL;
    771 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
    772 	ipst->ips_mfcs = NULL;
    773 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
    774 	ipst->ips_tbfs = NULL;
    775 
    776 	mutex_destroy(&ipst->ips_last_encap_lock);
    777 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
    778 }
    779 
    780 static boolean_t
    781 is_mrouter_off(ip_stack_t *ipst)
    782 {
    783 	conn_t	*mrouter;
    784 
    785 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    786 	if (ipst->ips_ip_g_mrouter == NULL) {
    787 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    788 		return (B_TRUE);
    789 	}
    790 
    791 	mrouter = ipst->ips_ip_g_mrouter;
    792 	if (mrouter->conn_multi_router == 0) {
    793 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    794 		return (B_TRUE);
    795 	}
    796 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    797 	return (B_FALSE);
    798 }
    799 
    800 static void
    801 unlock_good_vif(struct vif *vifp)
    802 {
    803 	ASSERT(vifp->v_ipif != NULL);
    804 	ipif_refrele(vifp->v_ipif);
    805 	VIF_REFRELE(vifp);
    806 }
    807 
    808 static boolean_t
    809 lock_good_vif(struct vif *vifp)
    810 {
    811 	mutex_enter(&vifp->v_lock);
    812 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
    813 		mutex_exit(&vifp->v_lock);
    814 		return (B_FALSE);
    815 	}
    816 
    817 	ASSERT(vifp->v_ipif != NULL);
    818 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
    819 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
    820 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
    821 		mutex_exit(&vifp->v_lock);
    822 		return (B_FALSE);
    823 	}
    824 	ipif_refhold_locked(vifp->v_ipif);
    825 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
    826 	vifp->v_refcnt++;
    827 	mutex_exit(&vifp->v_lock);
    828 	return (B_TRUE);
    829 }
    830 
    831 /*
    832  * Add a vif to the vif table.
    833  */
    834 static int
    835 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
    836 {
    837 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
    838 	ipif_t		*ipif;
    839 	int		error = 0;
    840 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
    841 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
    842 	ilm_t		*ilm;
    843 	ill_t		*ill;
    844 
    845 	ASSERT(connp != NULL);
    846 
    847 	if (vifcp->vifc_vifi >= MAXVIFS)
    848 		return (EINVAL);
    849 
    850 	if (is_mrouter_off(ipst))
    851 		return (EINVAL);
    852 
    853 	mutex_enter(&vifp->v_lock);
    854 	/*
    855 	 * Viftable entry should be 0.
    856 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
    857 	 * initialized.
    858 	 *
    859 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
    860 	 * request while the delete is in progress, mrouted only sends add
    861 	 * requests when a new interface is added and the new interface cannot
    862 	 * have the same vifi as an existing interface. We make sure that
    863 	 * ill_delete will block till the vif is deleted by adding a refcnt
    864 	 * to ipif in del_vif().
    865 	 */
    866 	if (vifp->v_lcl_addr.s_addr != 0 ||
    867 	    vifp->v_marks != 0 ||
    868 	    vifp->v_refcnt != 0) {
    869 		mutex_exit(&vifp->v_lock);
    870 		return (EADDRINUSE);
    871 	}
    872 
    873 	/* Incoming vif should not be 0 */
    874 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
    875 		mutex_exit(&vifp->v_lock);
    876 		return (EINVAL);
    877 	}
    878 
    879 	vifp->v_refcnt++;
    880 	mutex_exit(&vifp->v_lock);
    881 	/* Find the interface with the local address */
    882 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
    883 	    IPCL_ZONEID(connp), ipst);
    884 	if (ipif == NULL) {
    885 		VIF_REFRELE(vifp);
    886 		return (EADDRNOTAVAIL);
    887 	}
    888 
    889 	if (ipst->ips_ip_mrtdebug > 1) {
    890 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
    891 		    "add_vif: src 0x%x enter",
    892 		    vifcp->vifc_lcl_addr.s_addr);
    893 	}
    894 
    895 	mutex_enter(&vifp->v_lock);
    896 	/*
    897 	 * Always clear cache when vifs change.
    898 	 * Needed to ensure that src isn't left over from before vif was added.
    899 	 * No need to get last_encap_lock, since we are running as a writer.
    900 	 */
    901 
    902 	mutex_enter(&ipst->ips_last_encap_lock);
    903 	ipst->ips_last_encap_src = 0;
    904 	ipst->ips_last_encap_vif = NULL;
    905 	mutex_exit(&ipst->ips_last_encap_lock);
    906 
    907 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
    908 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
    909 			cmn_err(CE_WARN,
    910 			    "add_vif: source route tunnels not supported\n");
    911 			VIF_REFRELE_LOCKED(vifp);
    912 			ipif_refrele(ipif);
    913 			return (EOPNOTSUPP);
    914 		}
    915 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
    916 
    917 	} else {
    918 		/* Phyint or Register vif */
    919 		if (vifcp->vifc_flags & VIFF_REGISTER) {
    920 			/*
    921 			 * Note: Since all IPPROTO_IP level options (including
    922 			 * MRT_ADD_VIF) are done exclusively via
    923 			 * ip_optmgmt_writer(), a lock is not necessary to
    924 			 * protect reg_vif_num.
    925 			 */
    926 			mutex_enter(&ipst->ips_numvifs_mutex);
    927 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
    928 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
    929 				mutex_exit(&ipst->ips_numvifs_mutex);
    930 			} else {
    931 				mutex_exit(&ipst->ips_numvifs_mutex);
    932 				VIF_REFRELE_LOCKED(vifp);
    933 				ipif_refrele(ipif);
    934 				return (EADDRINUSE);
    935 			}
    936 		}
    937 
    938 		/* Make sure the interface supports multicast */
    939 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
    940 			VIF_REFRELE_LOCKED(vifp);
    941 			ipif_refrele(ipif);
    942 			if (vifcp->vifc_flags & VIFF_REGISTER) {
    943 				mutex_enter(&ipst->ips_numvifs_mutex);
    944 				ipst->ips_reg_vif_num = ALL_VIFS;
    945 				mutex_exit(&ipst->ips_numvifs_mutex);
    946 			}
    947 			return (EOPNOTSUPP);
    948 		}
    949 		/* Enable promiscuous reception of all IP mcasts from the if */
    950 		mutex_exit(&vifp->v_lock);
    951 
    952 		ill = ipif->ipif_ill;
    953 		if (IS_UNDER_IPMP(ill))
    954 			ill = ipmp_ill_hold_ipmp_ill(ill);
    955 
    956 		if (ill == NULL) {
    957 			ilm = NULL;
    958 		} else {
    959 			ilm = ip_addmulti(&ipv6_all_zeros, ill,
    960 			    ipif->ipif_zoneid, &error);
    961 			if (ilm != NULL)
    962 				atomic_inc_32(&ill->ill_mrouter_cnt);
    963 			if (IS_UNDER_IPMP(ipif->ipif_ill)) {
    964 				ill_refrele(ill);
    965 				ill = ipif->ipif_ill;
    966 			}
    967 		}
    968 
    969 		mutex_enter(&vifp->v_lock);
    970 		/*
    971 		 * since we released the lock lets make sure that
    972 		 * ip_mrouter_done() has not been called.
    973 		 */
    974 		if (ilm == NULL || is_mrouter_off(ipst)) {
    975 			if (ilm != NULL) {
    976 				(void) ip_delmulti(ilm);
    977 				ASSERT(ill->ill_mrouter_cnt > 0);
    978 				atomic_dec_32(&ill->ill_mrouter_cnt);
    979 			}
    980 			if (vifcp->vifc_flags & VIFF_REGISTER) {
    981 				mutex_enter(&ipst->ips_numvifs_mutex);
    982 				ipst->ips_reg_vif_num = ALL_VIFS;
    983 				mutex_exit(&ipst->ips_numvifs_mutex);
    984 			}
    985 			VIF_REFRELE_LOCKED(vifp);
    986 			ipif_refrele(ipif);
    987 			return (error?error:EINVAL);
    988 		}
    989 		vifp->v_ilm = ilm;
    990 	}
    991 	/* Define parameters for the tbf structure */
    992 	vifp->v_tbf = v_tbf;
    993 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
    994 	vifp->v_tbf->tbf_n_tok = 0;
    995 	vifp->v_tbf->tbf_q_len = 0;
    996 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
    997 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
    998 
    999 	vifp->v_flags = vifcp->vifc_flags;
   1000 	vifp->v_threshold = vifcp->vifc_threshold;
   1001 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
   1002 	vifp->v_ipif = ipif;
   1003 	ipif_refrele(ipif);
   1004 	/* Scaling up here, allows division by 1024 in critical code.	*/
   1005 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
   1006 	vifp->v_timeout_id = 0;
   1007 	/* initialize per vif pkt counters */
   1008 	vifp->v_pkt_in = 0;
   1009 	vifp->v_pkt_out = 0;
   1010 	vifp->v_bytes_in = 0;
   1011 	vifp->v_bytes_out = 0;
   1012 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
   1013 
   1014 	/* Adjust numvifs up, if the vifi is higher than numvifs */
   1015 	mutex_enter(&ipst->ips_numvifs_mutex);
   1016 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
   1017 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
   1018 	mutex_exit(&ipst->ips_numvifs_mutex);
   1019 
   1020 	if (ipst->ips_ip_mrtdebug > 1) {
   1021 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1022 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
   1023 		    vifcp->vifc_vifi,
   1024 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
   1025 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
   1026 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
   1027 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
   1028 	}
   1029 
   1030 	vifp->v_marks = VIF_MARK_GOOD;
   1031 	mutex_exit(&vifp->v_lock);
   1032 	return (0);
   1033 }
   1034 
   1035 
   1036 /* Delete a vif from the vif table. */
   1037 static void
   1038 del_vifp(struct vif *vifp)
   1039 {
   1040 	struct tbf	*t = vifp->v_tbf;
   1041 	mblk_t  *mp0;
   1042 	vifi_t  vifi;
   1043 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   1044 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1045 
   1046 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
   1047 	ASSERT(t != NULL);
   1048 
   1049 	if (ipst->ips_ip_mrtdebug > 1) {
   1050 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1051 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
   1052 	}
   1053 
   1054 	if (vifp->v_timeout_id != 0) {
   1055 		(void) untimeout(vifp->v_timeout_id);
   1056 		vifp->v_timeout_id = 0;
   1057 	}
   1058 
   1059 	/*
   1060 	 * Free packets queued at the interface.
   1061 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
   1062 	 */
   1063 	mutex_enter(&t->tbf_lock);
   1064 	while (t->tbf_q != NULL) {
   1065 		mp0 = t->tbf_q;
   1066 		t->tbf_q = t->tbf_q->b_next;
   1067 		mp0->b_prev = mp0->b_next = NULL;
   1068 		freemsg(mp0);
   1069 	}
   1070 	mutex_exit(&t->tbf_lock);
   1071 
   1072 	/*
   1073 	 * Always clear cache when vifs change.
   1074 	 * No need to get last_encap_lock since we are running as a writer.
   1075 	 */
   1076 	mutex_enter(&ipst->ips_last_encap_lock);
   1077 	if (vifp == ipst->ips_last_encap_vif) {
   1078 		ipst->ips_last_encap_vif = NULL;
   1079 		ipst->ips_last_encap_src = 0;
   1080 	}
   1081 	mutex_exit(&ipst->ips_last_encap_lock);
   1082 
   1083 	mutex_destroy(&t->tbf_lock);
   1084 
   1085 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
   1086 
   1087 	/* Adjust numvifs down */
   1088 	mutex_enter(&ipst->ips_numvifs_mutex);
   1089 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
   1090 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
   1091 			break;
   1092 	ipst->ips_numvifs = vifi;
   1093 	mutex_exit(&ipst->ips_numvifs_mutex);
   1094 
   1095 	bzero(vifp, sizeof (*vifp));
   1096 }
   1097 
   1098 static int
   1099 del_vif(vifi_t *vifip, ip_stack_t *ipst)
   1100 {
   1101 	struct vif	*vifp = ipst->ips_vifs + *vifip;
   1102 
   1103 	if (*vifip >= ipst->ips_numvifs)
   1104 		return (EINVAL);
   1105 
   1106 	mutex_enter(&vifp->v_lock);
   1107 	/*
   1108 	 * Not initialized
   1109 	 * Here we are not looking at the vif that is being initialized
   1110 	 * i.e vifp->v_marks == 0 and refcnt > 0.
   1111 	 */
   1112 	if (vifp->v_lcl_addr.s_addr == 0 ||
   1113 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
   1114 		mutex_exit(&vifp->v_lock);
   1115 		return (EADDRNOTAVAIL);
   1116 	}
   1117 
   1118 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
   1119 	vifp->v_marks &= ~VIF_MARK_GOOD;
   1120 	vifp->v_marks |= VIF_MARK_CONDEMNED;
   1121 
   1122 	/* Phyint only */
   1123 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
   1124 		ipif_t *ipif = vifp->v_ipif;
   1125 		ilm_t *ilm = vifp->v_ilm;
   1126 
   1127 		vifp->v_ilm = NULL;
   1128 
   1129 		ASSERT(ipif != NULL);
   1130 		/*
   1131 		 * should be OK to drop the lock as we
   1132 		 * have marked this as CONDEMNED.
   1133 		 */
   1134 		mutex_exit(&(vifp)->v_lock);
   1135 		if (ilm != NULL) {
   1136 			(void) ip_delmulti(ilm);
   1137 			ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
   1138 			atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
   1139 		}
   1140 		mutex_enter(&(vifp)->v_lock);
   1141 	}
   1142 
   1143 	if (vifp->v_flags & VIFF_REGISTER) {
   1144 		mutex_enter(&ipst->ips_numvifs_mutex);
   1145 		ipst->ips_reg_vif_num = ALL_VIFS;
   1146 		mutex_exit(&ipst->ips_numvifs_mutex);
   1147 	}
   1148 
   1149 	/*
   1150 	 * decreases the refcnt added in add_vif.
   1151 	 */
   1152 	VIF_REFRELE_LOCKED(vifp);
   1153 	return (0);
   1154 }
   1155 
   1156 /*
   1157  * Add an mfc entry.
   1158  */
   1159 static int
   1160 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
   1161 {
   1162 	struct mfc *rt;
   1163 	struct rtdetq *rte;
   1164 	ushort_t nstl;
   1165 	int i;
   1166 	struct mfcb *mfcbp;
   1167 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1168 
   1169 	/*
   1170 	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
   1171 	 * did not have a real route for pkt.
   1172 	 * We want this pkt without rt installed in the mfctable to prevent
   1173 	 * multiiple tries, so go ahead and put it in mfctable, it will
   1174 	 * be discarded later in ip_mdq() because the child is NULL.
   1175 	 */
   1176 
   1177 	/* Error checking, out of bounds? */
   1178 	if (mfccp->mfcc_parent > MAXVIFS) {
   1179 		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
   1180 		    (int)mfccp->mfcc_parent));
   1181 		return (EINVAL);
   1182 	}
   1183 
   1184 	if ((mfccp->mfcc_parent != NO_VIF) &&
   1185 	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
   1186 		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
   1187 		    (int)mfccp->mfcc_parent));
   1188 		return (EINVAL);
   1189 	}
   1190 
   1191 	if (is_mrouter_off(ipst)) {
   1192 		return (EINVAL);
   1193 	}
   1194 
   1195 	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
   1196 	    mfccp->mfcc_mcastgrp.s_addr)];
   1197 	MFCB_REFHOLD(mfcbp);
   1198 	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
   1199 	    mfccp->mfcc_mcastgrp.s_addr, rt);
   1200 
   1201 	/* If an entry already exists, just update the fields */
   1202 	if (rt) {
   1203 		if (ipst->ips_ip_mrtdebug > 1) {
   1204 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1205 			    "add_mfc: update o %x grp %x parent %x",
   1206 			    ntohl(mfccp->mfcc_origin.s_addr),
   1207 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
   1208 			    mfccp->mfcc_parent);
   1209 		}
   1210 		mutex_enter(&rt->mfc_mutex);
   1211 		rt->mfc_parent = mfccp->mfcc_parent;
   1212 
   1213 		mutex_enter(&ipst->ips_numvifs_mutex);
   1214 		for (i = 0; i < (int)ipst->ips_numvifs; i++)
   1215 			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
   1216 		mutex_exit(&ipst->ips_numvifs_mutex);
   1217 		mutex_exit(&rt->mfc_mutex);
   1218 
   1219 		MFCB_REFRELE(mfcbp);
   1220 		return (0);
   1221 	}
   1222 
   1223 	/*
   1224 	 * Find the entry for which the upcall was made and update.
   1225 	 */
   1226 	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
   1227 		mutex_enter(&rt->mfc_mutex);
   1228 		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
   1229 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
   1230 		    (rt->mfc_rte != NULL) &&
   1231 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
   1232 			if (nstl++ != 0)
   1233 				cmn_err(CE_WARN,
   1234 				    "add_mfc: %s o %x g %x p %x",
   1235 				    "multiple kernel entries",
   1236 				    ntohl(mfccp->mfcc_origin.s_addr),
   1237 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
   1238 				    mfccp->mfcc_parent);
   1239 
   1240 			if (ipst->ips_ip_mrtdebug > 1) {
   1241 				(void) mi_strlog(mrouter->conn_rq, 1,
   1242 				    SL_TRACE,
   1243 				    "add_mfc: o %x g %x p %x",
   1244 				    ntohl(mfccp->mfcc_origin.s_addr),
   1245 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
   1246 				    mfccp->mfcc_parent);
   1247 			}
   1248 			fill_route(rt, mfccp, ipst);
   1249 
   1250 			/*
   1251 			 * Prevent cleanup of cache entry.
   1252 			 * Timer starts in ip_mforward.
   1253 			 */
   1254 			if (rt->mfc_timeout_id != 0) {
   1255 				timeout_id_t id;
   1256 				id = rt->mfc_timeout_id;
   1257 				/*
   1258 				 * setting id to zero will avoid this
   1259 				 * entry from being cleaned up in
   1260 				 * expire_up_calls().
   1261 				 */
   1262 				rt->mfc_timeout_id = 0;
   1263 				/*
   1264 				 * dropping the lock is fine as we
   1265 				 * have a refhold on the bucket.
   1266 				 * so mfc cannot be freed.
   1267 				 * The timeout can fire but it will see
   1268 				 * that mfc_timeout_id == 0 and not cleanup.
   1269 				 */
   1270 				mutex_exit(&rt->mfc_mutex);
   1271 				(void) untimeout(id);
   1272 				mutex_enter(&rt->mfc_mutex);
   1273 			}
   1274 
   1275 			/*
   1276 			 * Send all pkts that are queued waiting for the upcall.
   1277 			 * ip_mdq param tun set to 0 -
   1278 			 * the return value of ip_mdq() isn't used here,
   1279 			 * so value we send doesn't matter.
   1280 			 */
   1281 			while (rt->mfc_rte != NULL) {
   1282 				rte = rt->mfc_rte;
   1283 				rt->mfc_rte = rte->rte_next;
   1284 				mutex_exit(&rt->mfc_mutex);
   1285 				(void) ip_mdq(rte->mp, (ipha_t *)
   1286 				    rte->mp->b_rptr, rte->ill, 0, rt);
   1287 				freemsg(rte->mp);
   1288 				mi_free((char *)rte);
   1289 				mutex_enter(&rt->mfc_mutex);
   1290 			}
   1291 		}
   1292 		mutex_exit(&rt->mfc_mutex);
   1293 	}
   1294 
   1295 
   1296 	/*
   1297 	 * It is possible that an entry is being inserted without an upcall
   1298 	 */
   1299 	if (nstl == 0) {
   1300 		mutex_enter(&(mfcbp->mfcb_lock));
   1301 		if (ipst->ips_ip_mrtdebug > 1) {
   1302 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1303 			    "add_mfc: no upcall o %x g %x p %x",
   1304 			    ntohl(mfccp->mfcc_origin.s_addr),
   1305 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
   1306 			    mfccp->mfcc_parent);
   1307 		}
   1308 		if (is_mrouter_off(ipst)) {
   1309 			mutex_exit(&mfcbp->mfcb_lock);
   1310 			MFCB_REFRELE(mfcbp);
   1311 			return (EINVAL);
   1312 		}
   1313 
   1314 		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
   1315 
   1316 			mutex_enter(&rt->mfc_mutex);
   1317 			if ((rt->mfc_origin.s_addr ==
   1318 			    mfccp->mfcc_origin.s_addr) &&
   1319 			    (rt->mfc_mcastgrp.s_addr ==
   1320 			    mfccp->mfcc_mcastgrp.s_addr) &&
   1321 			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
   1322 				fill_route(rt, mfccp, ipst);
   1323 				mutex_exit(&rt->mfc_mutex);
   1324 				break;
   1325 			}
   1326 			mutex_exit(&rt->mfc_mutex);
   1327 		}
   1328 
   1329 		/* No upcall, so make a new entry into mfctable */
   1330 		if (rt == NULL) {
   1331 			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
   1332 			if (rt == NULL) {
   1333 				ip1dbg(("add_mfc: out of memory\n"));
   1334 				mutex_exit(&mfcbp->mfcb_lock);
   1335 				MFCB_REFRELE(mfcbp);
   1336 				return (ENOBUFS);
   1337 			}
   1338 
   1339 			/* Insert new entry at head of hash chain */
   1340 			mutex_enter(&rt->mfc_mutex);
   1341 			fill_route(rt, mfccp, ipst);
   1342 
   1343 			/* Link into table */
   1344 			rt->mfc_next   = mfcbp->mfcb_mfc;
   1345 			mfcbp->mfcb_mfc = rt;
   1346 			mutex_exit(&rt->mfc_mutex);
   1347 		}
   1348 		mutex_exit(&mfcbp->mfcb_lock);
   1349 	}
   1350 
   1351 	MFCB_REFRELE(mfcbp);
   1352 	return (0);
   1353 }
   1354 
   1355 /*
   1356  * Fills in mfc structure from mrouted mfcctl.
   1357  */
   1358 static void
   1359 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
   1360 {
   1361 	int i;
   1362 
   1363 	rt->mfc_origin		= mfccp->mfcc_origin;
   1364 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
   1365 	rt->mfc_parent		= mfccp->mfcc_parent;
   1366 	mutex_enter(&ipst->ips_numvifs_mutex);
   1367 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
   1368 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
   1369 	}
   1370 	mutex_exit(&ipst->ips_numvifs_mutex);
   1371 	/* Initialize pkt counters per src-grp */
   1372 	rt->mfc_pkt_cnt	= 0;
   1373 	rt->mfc_byte_cnt	= 0;
   1374 	rt->mfc_wrong_if	= 0;
   1375 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
   1376 
   1377 }
   1378 
   1379 static void
   1380 free_queue(struct mfc *mfcp)
   1381 {
   1382 	struct rtdetq *rte0;
   1383 
   1384 	/*
   1385 	 * Drop all queued upcall packets.
   1386 	 * Free the mbuf with the pkt.
   1387 	 */
   1388 	while ((rte0 = mfcp->mfc_rte) != NULL) {
   1389 		mfcp->mfc_rte = rte0->rte_next;
   1390 		freemsg(rte0->mp);
   1391 		mi_free((char *)rte0);
   1392 	}
   1393 }
   1394 /*
   1395  * go thorugh the hash bucket and free all the entries marked condemned.
   1396  */
   1397 void
   1398 release_mfc(struct mfcb *mfcbp)
   1399 {
   1400 	struct mfc *current_mfcp;
   1401 	struct mfc *prev_mfcp;
   1402 
   1403 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
   1404 
   1405 	while (current_mfcp != NULL) {
   1406 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
   1407 			if (current_mfcp == mfcbp->mfcb_mfc) {
   1408 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
   1409 				free_queue(current_mfcp);
   1410 				mi_free(current_mfcp);
   1411 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
   1412 				continue;
   1413 			}
   1414 			ASSERT(prev_mfcp != NULL);
   1415 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
   1416 			free_queue(current_mfcp);
   1417 			mi_free(current_mfcp);
   1418 			current_mfcp = NULL;
   1419 		} else {
   1420 			prev_mfcp = current_mfcp;
   1421 		}
   1422 
   1423 		current_mfcp = prev_mfcp->mfc_next;
   1424 
   1425 	}
   1426 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
   1427 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
   1428 }
   1429 
   1430 /*
   1431  * Delete an mfc entry.
   1432  */
   1433 static int
   1434 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
   1435 {
   1436 	struct in_addr	origin;
   1437 	struct in_addr	mcastgrp;
   1438 	struct mfc 	*rt;
   1439 	uint_t		hash;
   1440 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1441 
   1442 	origin = mfccp->mfcc_origin;
   1443 	mcastgrp = mfccp->mfcc_mcastgrp;
   1444 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
   1445 
   1446 	if (ipst->ips_ip_mrtdebug > 1) {
   1447 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1448 		    "del_mfc: o %x g %x",
   1449 		    ntohl(origin.s_addr),
   1450 		    ntohl(mcastgrp.s_addr));
   1451 	}
   1452 
   1453 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
   1454 
   1455 	/* Find mfc in mfctable, finds only entries without upcalls */
   1456 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
   1457 		mutex_enter(&rt->mfc_mutex);
   1458 		if (origin.s_addr == rt->mfc_origin.s_addr &&
   1459 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
   1460 		    rt->mfc_rte == NULL &&
   1461 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
   1462 			break;
   1463 		mutex_exit(&rt->mfc_mutex);
   1464 	}
   1465 
   1466 	/*
   1467 	 * Return if there was an upcall (mfc_rte != NULL,
   1468 	 * or rt not in mfctable.
   1469 	 */
   1470 	if (rt == NULL) {
   1471 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
   1472 		return (EADDRNOTAVAIL);
   1473 	}
   1474 
   1475 
   1476 	/*
   1477 	 * no need to hold lock as we have a reference.
   1478 	 */
   1479 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
   1480 	/* error checking */
   1481 	if (rt->mfc_timeout_id != 0) {
   1482 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
   1483 		/*
   1484 		 * Its ok to drop the lock,  the struct cannot be freed
   1485 		 * since we have a ref on the hash bucket.
   1486 		 */
   1487 		rt->mfc_timeout_id = 0;
   1488 		mutex_exit(&rt->mfc_mutex);
   1489 		(void) untimeout(rt->mfc_timeout_id);
   1490 		mutex_enter(&rt->mfc_mutex);
   1491 	}
   1492 
   1493 	ASSERT(rt->mfc_rte == NULL);
   1494 
   1495 
   1496 	/*
   1497 	 * Delete the entry from the cache
   1498 	 */
   1499 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
   1500 	mutex_exit(&rt->mfc_mutex);
   1501 
   1502 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
   1503 
   1504 	return (0);
   1505 }
   1506 
/*
 * Size of the IP option used for tunnel encapsulation.  ip_mforward() adds
 * it to IP_SIMPLE_HDR_LENGTH when testing whether an arriving header is
 * long enough to carry that option (an LSRR source-route tunnel).
 */
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
   1508 
   1509 /*
   1510  * IP multicast forwarding function. This function assumes that the packet
   1511  * pointed to by ipha has arrived on (or is about to be sent to) the interface
   1512  * pointed to by "ill", and the packet is to be relayed to other networks
   1513  * that have members of the packet's destination IP multicast group.
   1514  *
   1515  * The packet is returned unscathed to the caller, unless it is
   1516  * erroneous, in which case a -1 value tells the caller (IP)
   1517  * to discard it.
   1518  *
   1519  * Unlike BSD, SunOS 5.x needs to return to IP info about
   1520  * whether pkt came in thru a tunnel, so it can be discarded, unless
   1521  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
   1522  * to be delivered.
   1523  * Return values are 0 - pkt is okay and phyint
   1524  *		    -1 - pkt is malformed and to be tossed
   1525  *                   1 - pkt came in on tunnel
   1526  */
   1527 int
   1528 ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
   1529 {
   1530 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   1531 	ill_t		*ill = ira->ira_ill;
   1532 	struct mfc 	*rt;
   1533 	ipaddr_t	src, dst, tunnel_src = 0;
   1534 	static int	srctun = 0;
   1535 	vifi_t		vifi;
   1536 	boolean_t	pim_reg_packet = B_FALSE;
   1537 	struct mfcb	*mfcbp;
   1538 	ip_stack_t	*ipst = ill->ill_ipst;
   1539 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1540 	ill_t		*rill = ira->ira_rill;
   1541 
   1542 	ASSERT(ira->ira_pktlen == msgdsize(mp));
   1543 
   1544 	if (ipst->ips_ip_mrtdebug > 1) {
   1545 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1546 		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
   1547 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
   1548 		    ill->ill_name);
   1549 	}
   1550 
   1551 	dst = ipha->ipha_dst;
   1552 	if (ira->ira_flags & IRAF_PIM_REGISTER)
   1553 		pim_reg_packet = B_TRUE;
   1554 	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
   1555 		tunnel_src = ira->ira_mroute_tunnel;
   1556 
   1557 	/*
   1558 	 * Don't forward a packet with time-to-live of zero or one,
   1559 	 * or a packet destined to a local-only group.
   1560 	 */
   1561 	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
   1562 	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
   1563 		if (ipst->ips_ip_mrtdebug > 1) {
   1564 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1565 			    "ip_mforward: not forwarded ttl %d,"
   1566 			    " dst 0x%x ill %s",
   1567 			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
   1568 		}
   1569 		if (tunnel_src != 0)
   1570 			return (1);
   1571 		else
   1572 			return (0);
   1573 	}
   1574 
   1575 	if ((tunnel_src != 0) || pim_reg_packet) {
   1576 		/*
   1577 		 * Packet arrived over an encapsulated tunnel or via a PIM
   1578 		 * register message.
   1579 		 */
   1580 		if (ipst->ips_ip_mrtdebug > 1) {
   1581 			if (tunnel_src != 0) {
   1582 				(void) mi_strlog(mrouter->conn_rq, 1,
   1583 				    SL_TRACE,
   1584 				    "ip_mforward: ill %s arrived via ENCAP TUN",
   1585 				    ill->ill_name);
   1586 			} else if (pim_reg_packet) {
   1587 				(void) mi_strlog(mrouter->conn_rq, 1,
   1588 				    SL_TRACE,
   1589 				    "ip_mforward: ill %s arrived via"
   1590 				    "  REGISTER VIF",
   1591 				    ill->ill_name);
   1592 			}
   1593 		}
   1594 	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
   1595 	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
   1596 	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
   1597 		/* Packet arrived via a physical interface. */
   1598 		if (ipst->ips_ip_mrtdebug > 1) {
   1599 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1600 			    "ip_mforward: ill %s arrived via PHYINT",
   1601 			    ill->ill_name);
   1602 		}
   1603 
   1604 	} else {
   1605 		/*
   1606 		 * Packet arrived through a SRCRT tunnel.
   1607 		 * Source-route tunnels are no longer supported.
   1608 		 * Error message printed every 1000 times.
   1609 		 */
   1610 		if ((srctun++ % 1000) == 0) {
   1611 			cmn_err(CE_WARN,
   1612 			    "ip_mforward: received source-routed pkt from %x",
   1613 			    ntohl(ipha->ipha_src));
   1614 		}
   1615 		return (-1);
   1616 	}
   1617 
   1618 	ipst->ips_mrtstat->mrts_fwd_in++;
   1619 	src = ipha->ipha_src;
   1620 
   1621 	/* Find route in cache, return NULL if not there or upcalls q'ed. */
   1622 
   1623 	/*
   1624 	 * Lock the mfctable against changes made by ip_mforward.
   1625 	 * Note that only add_mfc and del_mfc can remove entries and
   1626 	 * they run with exclusive access to IP. So we do not need to
   1627 	 * guard against the rt being deleted, so release lock after reading.
   1628 	 */
   1629 
   1630 	if (is_mrouter_off(ipst))
   1631 		return (-1);
   1632 
   1633 	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
   1634 	MFCB_REFHOLD(mfcbp);
   1635 	MFCFIND(mfcbp, src, dst, rt);
   1636 
   1637 	/* Entry exists, so forward if necessary */
   1638 	if (rt != NULL) {
   1639 		int ret = 0;
   1640 		ipst->ips_mrtstat->mrts_mfc_hits++;
   1641 		if (pim_reg_packet) {
   1642 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
   1643 			ret = ip_mdq(mp, ipha,
   1644 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
   1645 			    v_ipif->ipif_ill,
   1646 			    0, rt);
   1647 		} else {
   1648 			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
   1649 		}
   1650 
   1651 		MFCB_REFRELE(mfcbp);
   1652 		return (ret);
   1653 
   1654 		/*
   1655 		 * Don't forward if we don't have a cache entry.  Mrouted will
   1656 		 * always provide a cache entry in response to an upcall.
   1657 		 */
   1658 	} else {
   1659 		/*
   1660 		 * If we don't have a route for packet's origin, make a copy
   1661 		 * of the packet and send message to routing daemon.
   1662 		 */
   1663 		struct mfc	*mfc_rt	 = NULL;
   1664 		mblk_t		*mp0	 = NULL;
   1665 		mblk_t		*mp_copy = NULL;
   1666 		struct rtdetq	*rte	 = NULL;
   1667 		struct rtdetq	*rte_m, *rte1, *prev_rte;
   1668 		uint_t		hash;
   1669 		int		npkts;
   1670 		boolean_t	new_mfc = B_FALSE;
   1671 		ipst->ips_mrtstat->mrts_mfc_misses++;
   1672 		/* BSD uses mrts_no_route++ */
   1673 		if (ipst->ips_ip_mrtdebug > 1) {
   1674 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1675 			    "ip_mforward: no rte ill %s src %x g %x misses %d",
   1676 			    ill->ill_name, ntohl(src), ntohl(dst),
   1677 			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
   1678 		}
   1679 		/*
   1680 		 * The order of the following code differs from the BSD code.
   1681 		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
   1682 		 * code works, so SunOS 5.x wasn't changed to conform to the
   1683 		 * BSD version.
   1684 		 */
   1685 
   1686 		/* Lock mfctable. */
   1687 		hash = MFCHASH(src, dst);
   1688 		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
   1689 
   1690 		/*
   1691 		 * If we are turning off mrouted return an error
   1692 		 */
   1693 		if (is_mrouter_off(ipst)) {
   1694 			mutex_exit(&mfcbp->mfcb_lock);
   1695 			MFCB_REFRELE(mfcbp);
   1696 			return (-1);
   1697 		}
   1698 
   1699 		/* Is there an upcall waiting for this packet? */
   1700 		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
   1701 		    mfc_rt = mfc_rt->mfc_next) {
   1702 			mutex_enter(&mfc_rt->mfc_mutex);
   1703 			if (ipst->ips_ip_mrtdebug > 1) {
   1704 				(void) mi_strlog(mrouter->conn_rq, 1,
   1705 				    SL_TRACE,
   1706 				    "ip_mforward: MFCTAB hash %d o 0x%x"
   1707 				    " g 0x%x\n",
   1708 				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
   1709 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
   1710 			}
   1711 			/* There is an upcall */
   1712 			if ((src == mfc_rt->mfc_origin.s_addr) &&
   1713 			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
   1714 			    (mfc_rt->mfc_rte != NULL) &&
   1715 			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
   1716 				break;
   1717 			}
   1718 			mutex_exit(&mfc_rt->mfc_mutex);
   1719 		}
   1720 		/* No upcall, so make a new entry into mfctable */
   1721 		if (mfc_rt == NULL) {
   1722 			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
   1723 			if (mfc_rt == NULL) {
   1724 				ipst->ips_mrtstat->mrts_fwd_drop++;
   1725 				ip1dbg(("ip_mforward: out of memory "
   1726 				    "for mfc, mfc_rt\n"));
   1727 				goto error_return;
   1728 			} else
   1729 				new_mfc = B_TRUE;
   1730 			/* Get resources */
   1731 			/* TODO could copy header and dup rest */
   1732 			mp_copy = copymsg(mp);
   1733 			if (mp_copy == NULL) {
   1734 				ipst->ips_mrtstat->mrts_fwd_drop++;
   1735 				ip1dbg(("ip_mforward: out of memory for "
   1736 				    "mblk, mp_copy\n"));
   1737 				goto error_return;
   1738 			}
   1739 			mutex_enter(&mfc_rt->mfc_mutex);
   1740 		}
   1741 		/* Get resources for rte, whether first rte or not first. */
   1742 		/* Add this packet into rtdetq */
   1743 		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
   1744 		if (rte == NULL) {
   1745 			ipst->ips_mrtstat->mrts_fwd_drop++;
   1746 			mutex_exit(&mfc_rt->mfc_mutex);
   1747 			ip1dbg(("ip_mforward: out of memory for"
   1748 			    " rtdetq, rte\n"));
   1749 			goto error_return;
   1750 		}
   1751 
   1752 		mp0 = copymsg(mp);
   1753 		if (mp0 == NULL) {
   1754 			ipst->ips_mrtstat->mrts_fwd_drop++;
   1755 			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
   1756 			mutex_exit(&mfc_rt->mfc_mutex);
   1757 			goto error_return;
   1758 		}
   1759 		rte->mp		= mp0;
   1760 		if (pim_reg_packet) {
   1761 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
   1762 			rte->ill =
   1763 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
   1764 			    v_ipif->ipif_ill;
   1765 		} else {
   1766 			rte->ill = ill;
   1767 		}
   1768 		rte->rte_next	= NULL;
   1769 
   1770 		/*
   1771 		 * Determine if upcall q (rtdetq) has overflowed.
   1772 		 * mfc_rt->mfc_rte is null by mi_zalloc
   1773 		 * if it is the first message.
   1774 		 */
   1775 		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
   1776 		    rte_m = rte_m->rte_next)
   1777 			npkts++;
   1778 		if (ipst->ips_ip_mrtdebug > 1) {
   1779 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1780 			    "ip_mforward: upcalls %d\n", npkts);
   1781 		}
   1782 		if (npkts > MAX_UPQ) {
   1783 			ipst->ips_mrtstat->mrts_upq_ovflw++;
   1784 			mutex_exit(&mfc_rt->mfc_mutex);
   1785 			goto error_return;
   1786 		}
   1787 
   1788 		if (npkts == 0) {	/* first upcall */
   1789 			int i = 0;
   1790 			/*
   1791 			 * Now finish installing the new mfc! Now that we have
   1792 			 * resources!  Insert new entry at head of hash chain.
   1793 			 * Use src and dst which are ipaddr_t's.
   1794 			 */
   1795 			mfc_rt->mfc_origin.s_addr = src;
   1796 			mfc_rt->mfc_mcastgrp.s_addr = dst;
   1797 
   1798 			mutex_enter(&ipst->ips_numvifs_mutex);
   1799 			for (i = 0; i < (int)ipst->ips_numvifs; i++)
   1800 				mfc_rt->mfc_ttls[i] = 0;
   1801 			mutex_exit(&ipst->ips_numvifs_mutex);
   1802 			mfc_rt->mfc_parent = ALL_VIFS;
   1803 
   1804 			/* Link into table */
   1805 			if (ipst->ips_ip_mrtdebug > 1) {
   1806 				(void) mi_strlog(mrouter->conn_rq, 1,
   1807 				    SL_TRACE,
   1808 				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
   1809 				    "g 0x%x\n", hash,
   1810 				    ntohl(mfc_rt->mfc_origin.s_addr),
   1811 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
   1812 			}
   1813 			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
   1814 			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
   1815 			mfc_rt->mfc_rte = NULL;
   1816 		}
   1817 
   1818 		/* Link in the upcall */
   1819 		/* First upcall */
   1820 		if (mfc_rt->mfc_rte == NULL)
   1821 			mfc_rt->mfc_rte = rte;
   1822 		else {
   1823 			/* not the first upcall */
   1824 			prev_rte = mfc_rt->mfc_rte;
   1825 			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
   1826 			    prev_rte = rte1, rte1 = rte1->rte_next)
   1827 				;
   1828 			prev_rte->rte_next = rte;
   1829 		}
   1830 
   1831 		/*
   1832 		 * No upcalls waiting, this is first one, so send a message to
   1833 		 * routing daemon to install a route into kernel table.
   1834 		 */
   1835 		if (npkts == 0) {
   1836 			struct igmpmsg	*im;
   1837 			/* ipha_protocol is 0, for upcall */
   1838 			ASSERT(mp_copy != NULL);
   1839 			im = (struct igmpmsg *)mp_copy->b_rptr;
   1840 			im->im_msgtype	= IGMPMSG_NOCACHE;
   1841 			im->im_mbz = 0;
   1842 			mutex_enter(&ipst->ips_numvifs_mutex);
   1843 			if (pim_reg_packet) {
   1844 				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
   1845 				mutex_exit(&ipst->ips_numvifs_mutex);
   1846 			} else {
   1847 				/*
   1848 				 * XXX do we need to hold locks here ?
   1849 				 */
   1850 				for (vifi = 0;
   1851 				    vifi < ipst->ips_numvifs;
   1852 				    vifi++) {
   1853 					if (ipst->ips_vifs[vifi].v_ipif == NULL)
   1854 						continue;
   1855 					if (ipst->ips_vifs[vifi].
   1856 					    v_ipif->ipif_ill == ill) {
   1857 						im->im_vif = (uchar_t)vifi;
   1858 						break;
   1859 					}
   1860 				}
   1861 				mutex_exit(&ipst->ips_numvifs_mutex);
   1862 				ASSERT(vifi < ipst->ips_numvifs);
   1863 			}
   1864 
   1865 			ipst->ips_mrtstat->mrts_upcalls++;
   1866 			/* Timer to discard upcalls if mrouted is too slow */
   1867 			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
   1868 			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
   1869 			mutex_exit(&mfc_rt->mfc_mutex);
   1870 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
   1871 			/* Pass to RAWIP */
   1872 			ira->ira_ill = ira->ira_rill = NULL;
   1873 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
   1874 			ira->ira_ill = ill;
   1875 			ira->ira_rill = rill;
   1876 		} else {
   1877 			mutex_exit(&mfc_rt->mfc_mutex);
   1878 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
   1879 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1880 			ip_drop_input("ip_mforward - upcall already waiting",
   1881 			    mp_copy, ill);
   1882 			freemsg(mp_copy);
   1883 		}
   1884 
   1885 		MFCB_REFRELE(mfcbp);
   1886 		if (tunnel_src != 0)
   1887 			return (1);
   1888 		else
   1889 			return (0);
   1890 	error_return:
   1891 		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
   1892 		MFCB_REFRELE(mfcbp);
   1893 		if (mfc_rt != NULL && (new_mfc == B_TRUE))
   1894 			mi_free((char *)mfc_rt);
   1895 		if (rte != NULL)
   1896 			mi_free((char *)rte);
   1897 		if (mp_copy != NULL) {
   1898 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1899 			ip_drop_input("ip_mforward error", mp_copy, ill);
   1900 			freemsg(mp_copy);
   1901 		}
   1902 		if (mp0 != NULL)
   1903 			freemsg(mp0);
   1904 		return (-1);
   1905 	}
   1906 }
   1907 
   1908 /*
   1909  * Clean up the mfctable cache entry if upcall is not serviced.
   1910  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
   1911  */
   1912 static void
   1913 expire_upcalls(void *arg)
   1914 {
   1915 	struct mfc *mfc_rt = arg;
   1916 	uint_t hash;
   1917 	struct mfc *prev_mfc, *mfc0;
   1918 	ip_stack_t	*ipst;
   1919 	conn_t		*mrouter;
   1920 
   1921 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
   1922 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
   1923 		return;
   1924 	}
   1925 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
   1926 	mrouter = ipst->ips_ip_g_mrouter;
   1927 
   1928 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
   1929 	if (ipst->ips_ip_mrtdebug > 1) {
   1930 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1931 		    "expire_upcalls: hash %d s %x g %x",
   1932 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
   1933 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
   1934 	}
   1935 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
   1936 	mutex_enter(&mfc_rt->mfc_mutex);
   1937 	/*
   1938 	 * if timeout has been set to zero, than the
   1939 	 * entry has been filled, no need to delete it.
   1940 	 */
   1941 	if (mfc_rt->mfc_timeout_id == 0)
   1942 		goto done;
   1943 	ipst->ips_mrtstat->mrts_cache_cleanups++;
   1944 	mfc_rt->mfc_timeout_id = 0;
   1945 
   1946 	/* Determine entry to be cleaned up in cache table. */
   1947 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
   1948 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
   1949 		if (mfc0 == mfc_rt)
   1950 			break;
   1951 
   1952 	/* del_mfc takes care of gone mfcs */
   1953 	ASSERT(prev_mfc != NULL);
   1954 	ASSERT(mfc0 != NULL);
   1955 
   1956 	/*
   1957 	 * Delete the entry from the cache
   1958 	 */
   1959 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
   1960 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
   1961 
   1962 	/*
   1963 	 * release_mfc will drop all queued upcall packets.
   1964 	 * and will free the mbuf with the pkt, if, timing info.
   1965 	 */
   1966 done:
   1967 	mutex_exit(&mfc_rt->mfc_mutex);
   1968 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
   1969 }
   1970 
   1971 /*
   1972  * Packet forwarding routine once entry in the cache is made.
   1973  */
   1974 static int
   1975 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
   1976     struct mfc *rt)
   1977 {
   1978 	vifi_t vifi;
   1979 	struct vif *vifp;
   1980 	ipaddr_t dst = ipha->ipha_dst;
   1981 	size_t  plen = msgdsize(mp);
   1982 	vifi_t num_of_vifs;
   1983 	ip_stack_t	*ipst = ill->ill_ipst;
   1984 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1985 	ip_recv_attr_t	iras;
   1986 
   1987 	if (ipst->ips_ip_mrtdebug > 1) {
   1988 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1989 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
   1990 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
   1991 		    ill->ill_name);
   1992 	}
   1993 
   1994 	/* Macro to send packet on vif */
   1995 #define	MC_SEND(ipha, mp, vifp, dst) { \
   1996 	if ((vifp)->v_flags & VIFF_TUNNEL) \
   1997 		encap_send((ipha), (mp), (vifp), (dst)); \
   1998 	else if ((vifp)->v_flags & VIFF_REGISTER) \
   1999 		register_send((ipha), (mp), (vifp), (dst)); \
   2000 	else \
   2001 		phyint_send((ipha), (mp), (vifp), (dst)); \
   2002 }
   2003 
   2004 	vifi = rt->mfc_parent;
   2005 
   2006 	/*
   2007 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
   2008 	 * Mrouted had no route.
   2009 	 * We wanted the route installed in the mfctable to prevent multiple
   2010 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
   2011 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
   2012 	 * 3.6.
   2013 	 */
   2014 	if (vifi == NO_VIF) {
   2015 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
   2016 		    ill->ill_name));
   2017 		if (ipst->ips_ip_mrtdebug > 1) {
   2018 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2019 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
   2020 		}
   2021 		return (-1);	/* drop pkt */
   2022 	}
   2023 
   2024 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
   2025 		return (-1);
   2026 	/*
   2027 	 * The MFC entries are not cleaned up when an ipif goes
   2028 	 * away thus this code has to guard against an MFC referencing
   2029 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
   2030 	 * sets the v_ipif to NULL when the ipif disappears.
   2031 	 */
   2032 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
   2033 
   2034 	if (vifi >= ipst->ips_numvifs) {
   2035 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
   2036 		    "%d ill %s viftable ill %s\n",
   2037 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
   2038 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
   2039 		unlock_good_vif(&ipst->ips_vifs[vifi]);
   2040 		return (-1);
   2041 	}
   2042 	/*
   2043 	 * Don't forward if it didn't arrive from the parent vif for its
   2044 	 * origin.
   2045 	 */
   2046 	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
   2047 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
   2048 		/* Came in the wrong interface */
   2049 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
   2050 			"numvifs %d ill %s viftable ill %s\n",
   2051 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
   2052 			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
   2053 		if (ipst->ips_ip_mrtdebug > 1) {
   2054 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2055 			    "ip_mdq: arrived wrong if, vifi %d ill "
   2056 			    "%s viftable ill %s\n",
   2057 			    (int)vifi, ill->ill_name,
   2058 			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
   2059 		}
   2060 		ipst->ips_mrtstat->mrts_wrong_if++;
   2061 		rt->mfc_wrong_if++;
   2062 
   2063 		/*
   2064 		 * If we are doing PIM assert processing and we are forwarding
   2065 		 * packets on this interface, and it is a broadcast medium
   2066 		 * interface (and not a tunnel), send a message to the routing.
   2067 		 *
   2068 		 * We use the first ipif on the list, since it's all we have.
   2069 		 * Chances are the ipif_flags are the same for ipifs on the ill.
   2070 		 */
   2071 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
   2072 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
   2073 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
   2074 			mblk_t		*mp_copy;
   2075 			struct igmpmsg	*im;
   2076 
   2077 			/* TODO could copy header and dup rest */
   2078 			mp_copy = copymsg(mp);
   2079 			if (mp_copy == NULL) {
   2080 				ipst->ips_mrtstat->mrts_fwd_drop++;
   2081 				ip1dbg(("ip_mdq: out of memory "
   2082 				    "for mblk, mp_copy\n"));
   2083 				unlock_good_vif(&ipst->ips_vifs[vifi]);
   2084 				return (-1);
   2085 			}
   2086 
   2087 			im = (struct igmpmsg *)mp_copy->b_rptr;
   2088 			im->im_msgtype = IGMPMSG_WRONGVIF;
   2089 			im->im_mbz = 0;
   2090 			im->im_vif = (ushort_t)vifi;
   2091 			/* Pass to RAWIP */
   2092 
   2093 			bzero(&iras, sizeof (iras));
   2094 			iras.ira_flags = IRAF_IS_IPV4;
   2095 			iras.ira_ip_hdr_length =
   2096 			    IPH_HDR_LENGTH(mp_copy->b_rptr);
   2097 			iras.ira_pktlen = msgdsize(mp_copy);
   2098 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
   2099 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2100 		}
   2101 		unlock_good_vif(&ipst->ips_vifs[vifi]);
   2102 		if (tunnel_src != 0)
   2103 			return (1);
   2104 		else
   2105 			return (0);
   2106 	}
   2107 	/*
   2108 	 * If I sourced this packet, it counts as output, else it was input.
   2109 	 */
   2110 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
   2111 		ipst->ips_vifs[vifi].v_pkt_out++;
   2112 		ipst->ips_vifs[vifi].v_bytes_out += plen;
   2113 	} else {
   2114 		ipst->ips_vifs[vifi].v_pkt_in++;
   2115 		ipst->ips_vifs[vifi].v_bytes_in += plen;
   2116 	}
   2117 	mutex_enter(&rt->mfc_mutex);
   2118 	rt->mfc_pkt_cnt++;
   2119 	rt->mfc_byte_cnt += plen;
   2120 	mutex_exit(&rt->mfc_mutex);
   2121 	unlock_good_vif(&ipst->ips_vifs[vifi]);
   2122 	/*
   2123 	 * For each vif, decide if a copy of the packet should be forwarded.
   2124 	 * Forward if:
   2125 	 *		- the vif threshold ttl is non-zero AND
   2126 	 *		- the pkt ttl exceeds the vif's threshold
   2127 	 * A non-zero mfc_ttl indicates that the vif is part of
   2128 	 * the output set for the mfc entry.
   2129 	 */
   2130 	mutex_enter(&ipst->ips_numvifs_mutex);
   2131 	num_of_vifs = ipst->ips_numvifs;
   2132 	mutex_exit(&ipst->ips_numvifs_mutex);
   2133 	for (vifp = ipst->ips_vifs, vifi = 0;
   2134 	    vifi < num_of_vifs;
   2135 	    vifp++, vifi++) {
   2136 		if (!lock_good_vif(vifp))
   2137 			continue;
   2138 		if ((rt->mfc_ttls[vifi] > 0) &&
   2139 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
   2140 			/*
   2141 			 * lock_good_vif should not have succedded if
   2142 			 * v_ipif is null.
   2143 			 */
   2144 			ASSERT(vifp->v_ipif != NULL);
   2145 			vifp->v_pkt_out++;
   2146 			vifp->v_bytes_out += plen;
   2147 			MC_SEND(ipha, mp, vifp, dst);
   2148 			ipst->ips_mrtstat->mrts_fwd_out++;
   2149 		}
   2150 		unlock_good_vif(vifp);
   2151 	}
   2152 	if (tunnel_src != 0)
   2153 		return (1);
   2154 	else
   2155 		return (0);
   2156 }
   2157 
   2158 /*
   2159  * Send the packet on physical interface.
   2160  * Caller assumes can continue to use mp on return.
   2161  */
   2162 /* ARGSUSED */
   2163 static void
   2164 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
   2165 {
   2166 	mblk_t 	*mp_copy;
   2167 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2168 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2169 
   2170 	/* Make a new reference to the packet */
   2171 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
   2172 	if (mp_copy == NULL) {
   2173 		ipst->ips_mrtstat->mrts_fwd_drop++;
   2174 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
   2175 		return;
   2176 	}
   2177 	if (vifp->v_rate_limit <= 0)
   2178 		tbf_send_packet(vifp, mp_copy);
   2179 	else  {
   2180 		if (ipst->ips_ip_mrtdebug > 1) {
   2181 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2182 			    "phyint_send: tbf_contr rate %d "
   2183 			    "vifp 0x%p mp 0x%p dst 0x%x",
   2184 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
   2185 		}
   2186 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
   2187 	}
   2188 }
   2189 
   2190 /*
   2191  * Send the whole packet for REGISTER encapsulation to PIM daemon
   2192  * Caller assumes it can continue to use mp on return.
   2193  */
   2194 /* ARGSUSED */
   2195 static void
   2196 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
   2197 {
   2198 	struct igmpmsg	*im;
   2199 	mblk_t		*mp_copy;
   2200 	ipha_t		*ipha_copy;
   2201 	ill_t		*ill = vifp->v_ipif->ipif_ill;
   2202 	ip_stack_t	*ipst = ill->ill_ipst;
   2203 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2204 	ip_recv_attr_t	iras;
   2205 
   2206 	if (ipst->ips_ip_mrtdebug > 1) {
   2207 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2208 		    "register_send: src %x, dst %x\n",
   2209 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
   2210 	}
   2211 
   2212 	/*
   2213 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
   2214 	 * can modify it.  Try to fill the new mblk_t since if we don't the
   2215 	 * ethernet driver will.
   2216 	 */
   2217 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
   2218 	if (mp_copy == NULL) {
   2219 		++ipst->ips_mrtstat->mrts_pim_nomemory;
   2220 		if (ipst->ips_ip_mrtdebug > 3) {
   2221 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2222 			    "register_send: allocb failure.");
   2223 		}
   2224 		return;
   2225 	}
   2226 
   2227 	/*
   2228 	 * Bump write pointer to account for igmpmsg being added.
   2229 	 */
   2230 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
   2231 
   2232 	/*
   2233 	 * Chain packet to new mblk_t.
   2234 	 */
   2235 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
   2236 		++ipst->ips_mrtstat->mrts_pim_nomemory;
   2237 		if (ipst->ips_ip_mrtdebug > 3) {
   2238 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2239 			    "register_send: copymsg failure.");
   2240 		}
   2241 		freeb(mp_copy);
   2242 		return;
   2243 	}
   2244 
   2245 	/*
   2246 	 * icmp_input() asserts that IP version field is set to an
   2247 	 * appropriate version. Hence, the struct igmpmsg that this really
   2248 	 * becomes, needs to have the correct IP version field.
   2249 	 */
   2250 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
   2251 	*ipha_copy = multicast_encap_iphdr;
   2252 
   2253 	/*
   2254 	 * The kernel uses the struct igmpmsg header to encode the messages to
   2255 	 * the multicast routing daemon. Fill in the fields in the header
   2256 	 * starting with the message type which is IGMPMSG_WHOLEPKT
   2257 	 */
   2258 	im = (struct igmpmsg *)mp_copy->b_rptr;
   2259 	im->im_msgtype = IGMPMSG_WHOLEPKT;
   2260 	im->im_src.s_addr = ipha->ipha_src;
   2261 	im->im_dst.s_addr = ipha->ipha_dst;
   2262 
   2263 	/*
   2264 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
   2265 	 * header with renamed fields and the multicast routing daemon uses
   2266 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
   2267 	 */
   2268 	im->im_mbz = 0;
   2269 
   2270 	++ipst->ips_mrtstat->mrts_upcalls;
   2271 	if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
   2272 	    !canputnext(mrouter->conn_rq)) {
   2273 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
   2274 		if (ipst->ips_ip_mrtdebug > 3) {
   2275 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2276 			    "register_send: register upcall failure.");
   2277 		}
   2278 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2279 		ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
   2280 		freemsg(mp_copy);
   2281 	} else {
   2282 		/* Pass to RAWIP */
   2283 		bzero(&iras, sizeof (iras));
   2284 		iras.ira_flags = IRAF_IS_IPV4;
   2285 		iras.ira_ip_hdr_length = sizeof (ipha_t);
   2286 		iras.ira_pktlen = msgdsize(mp_copy);
   2287 		(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
   2288 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   2289 	}
   2290 }
   2291 
   2292 /*
   2293  * pim_validate_cksum handles verification of the checksum in the
   2294  * pim header.  For PIM Register packets, the checksum is calculated
   2295  * across the PIM header only.  For all other packets, the checksum
   2296  * is for the PIM header and remainder of the packet.
   2297  *
   2298  * returns: B_TRUE, if checksum is okay.
   2299  *          B_FALSE, if checksum is not valid.
   2300  */
   2301 static boolean_t
   2302 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
   2303 {
   2304 	mblk_t *mp_dup;
   2305 
   2306 	if ((mp_dup = dupmsg(mp)) == NULL)
   2307 		return (B_FALSE);
   2308 
   2309 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
   2310 	if (pimp->pim_type == PIM_REGISTER)
   2311 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
   2312 	if (IP_CSUM(mp_dup, 0, 0)) {
   2313 		freemsg(mp_dup);
   2314 		return (B_FALSE);
   2315 	}
   2316 	freemsg(mp_dup);
   2317 	return (B_TRUE);
   2318 }
   2319 
   2320 /*
   2321  * Process PIM protocol packets i.e. IP Protocol 103.
   2322  * Register messages are decapsulated and sent onto multicast forwarding.
   2323  *
   2324  * Return NULL for a bad packet that is discarded here.
   2325  * Return mp if the message is OK and should be handed to "raw" receivers.
   2326  * Callers of pim_input() may need to reinitialize variables that were copied
   2327  * from the mblk as this calls pullupmsg().
   2328  */
   2329 mblk_t *
   2330 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
   2331 {
   2332 	ipha_t		*eip, *ip;
   2333 	int		iplen, pimlen, iphlen;
   2334 	struct pim	*pimp;	/* pointer to a pim struct */
   2335 	uint32_t	*reghdr;
   2336 	ill_t		*ill = ira->ira_ill;
   2337 	ip_stack_t	*ipst = ill->ill_ipst;
   2338 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2339 
   2340 	/*
   2341 	 * Pullup the msg for PIM protocol processing.
   2342 	 */
   2343 	if (pullupmsg(mp, -1) == 0) {
   2344 		++ipst->ips_mrtstat->mrts_pim_nomemory;
   2345 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2346 		ip_drop_input("mrts_pim_nomemory", mp, ill);
   2347 		freemsg(mp);
   2348 		return (NULL);
   2349 	}
   2350 
   2351 	ip = (ipha_t *)mp->b_rptr;
   2352 	iplen = ip->ipha_length;
   2353 	iphlen = IPH_HDR_LENGTH(ip);
   2354 	pimlen = ntohs(iplen) - iphlen;
   2355 
   2356 	/*
   2357 	 * Validate lengths
   2358 	 */
   2359 	if (pimlen < PIM_MINLEN) {
   2360 		++ipst->ips_mrtstat->mrts_pim_malformed;
   2361 		if (ipst->ips_ip_mrtdebug > 1) {
   2362 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2363 			    "pim_input: length not at least minlen");
   2364 		}
   2365 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2366 		ip_drop_input("mrts_pim_malformed", mp, ill);
   2367 		freemsg(mp);
   2368 		return (NULL);
   2369 	}
   2370 
   2371 	/*
   2372 	 * Point to the PIM header.
   2373 	 */
   2374 	pimp = (struct pim *)((caddr_t)ip + iphlen);
   2375 
   2376 	/*
   2377 	 * Check the version number.
   2378 	 */
   2379 	if (pimp->pim_vers != PIM_VERSION) {
   2380 		++ipst->ips_mrtstat->mrts_pim_badversion;
   2381 		if (ipst->ips_ip_mrtdebug > 1) {
   2382 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2383 			    "pim_input: unknown version of PIM");
   2384 		}
   2385 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2386 		ip_drop_input("mrts_pim_badversion", mp, ill);
   2387 		freemsg(mp);
   2388 		return (NULL);
   2389 	}
   2390 
   2391 	/*
   2392 	 * Validate the checksum
   2393 	 */
   2394 	if (!pim_validate_cksum(mp, ip, pimp)) {
   2395 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
   2396 		if (ipst->ips_ip_mrtdebug > 1) {
   2397 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2398 			    "pim_input: invalid checksum");
   2399 		}
   2400 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2401 		ip_drop_input("pim_rcv_badcsum", mp, ill);
   2402 		freemsg(mp);
   2403 		return (NULL);
   2404 	}
   2405 
   2406 	if (pimp->pim_type != PIM_REGISTER)
   2407 		return (mp);
   2408 
   2409 	reghdr = (uint32_t *)(pimp + 1);
   2410 	eip = (ipha_t *)(reghdr + 1);
   2411 
   2412 	/*
   2413 	 * check if the inner packet is destined to mcast group
   2414 	 */
   2415 	if (!CLASSD(eip->ipha_dst)) {
   2416 		++ipst->ips_mrtstat->mrts_pim_badregisters;
   2417 		if (ipst->ips_ip_mrtdebug > 1) {
   2418 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2419 			    "pim_input: Inner pkt not mcast .. !");
   2420 		}
   2421 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2422 		ip_drop_input("mrts_pim_badregisters", mp, ill);
   2423 		freemsg(mp);
   2424 		return (NULL);
   2425 	}
   2426 	if (ipst->ips_ip_mrtdebug > 1) {
   2427 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2428 		    "register from %x, to %x, len %d",
   2429 		    ntohl(eip->ipha_src),
   2430 		    ntohl(eip->ipha_dst),
   2431 		    ntohs(eip->ipha_length));
   2432 	}
   2433 	/*
   2434 	 * If the null register bit is not set, decapsulate
   2435 	 * the packet before forwarding it.
   2436 	 * Avoid this in no register vif
   2437 	 */
   2438 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
   2439 	    ipst->ips_reg_vif_num != ALL_VIFS) {
   2440 		mblk_t *mp_copy;
   2441 		uint_t saved_pktlen;
   2442 
   2443 		/* Copy the message */
   2444 		if ((mp_copy = copymsg(mp)) == NULL) {
   2445 			++ipst->ips_mrtstat->mrts_pim_nomemory;
   2446 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2447 			ip_drop_input("mrts_pim_nomemory", mp, ill);
   2448 			freemsg(mp);
   2449 			return (NULL);
   2450 		}
   2451 
   2452 		/*
   2453 		 * Decapsulate the packet and give it to
   2454 		 * register_mforward.
   2455 		 */
   2456 		mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
   2457 		saved_pktlen = ira->ira_pktlen;
   2458 		ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
   2459 		if (register_mforward(mp_copy, ira) != 0) {
   2460 			/* register_mforward already called ip_drop_input */
   2461 			freemsg(mp);
   2462 			ira->ira_pktlen = saved_pktlen;
   2463 			return (NULL);
   2464 		}
   2465 		ira->ira_pktlen = saved_pktlen;
   2466 	}
   2467 
   2468 	/*
   2469 	 * Pass all valid PIM packets up to any process(es) listening on a raw
   2470 	 * PIM socket. For Solaris it is done right after pim_input() is
   2471 	 * called.
   2472 	 */
   2473 	return (mp);
   2474 }
   2475 
   2476 /*
   2477  * PIM sparse mode hook.  Called by pim_input after decapsulating
   2478  * the packet. Loop back the packet, as if we have received it.
   2479  * In pim_input() we have to check if the destination is a multicast address.
   2480  */
   2481 static int
   2482 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
   2483 {
   2484 	ire_t		*ire;
   2485 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   2486 	ill_t		*ill = ira->ira_ill;
   2487 	ip_stack_t	*ipst = ill->ill_ipst;
   2488 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2489 
   2490 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
   2491 
   2492 	if (ipst->ips_ip_mrtdebug > 3) {
   2493 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2494 		    "register_mforward: src %x, dst %x\n",
   2495 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
   2496 	}
   2497 	/*
   2498 	 * Need to pass in to ip_mforward() the information that the
   2499 	 * packet has arrived on the register_vif. We mark it with
   2500 	 * the IRAF_PIM_REGISTER attribute.
   2501 	 * pim_input verified that the (inner) destination is multicast,
   2502 	 * hence we skip the generic code in ip_input.
   2503 	 */
   2504 	ira->ira_flags |= IRAF_PIM_REGISTER;
   2505 	++ipst->ips_mrtstat->mrts_pim_regforwards;
   2506 
   2507 	if (!CLASSD(ipha->ipha_dst)) {
   2508 		ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
   2509 		    ira->ira_tsl, MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL,
   2510 		    NULL, NULL);
   2511 	} else {
   2512 		ire = ire_multicast(ill);
   2513 	}
   2514 	ASSERT(ire != NULL);
   2515 	/* Normally this will return the IRE_MULTICAST */
   2516 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
   2517 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2518 		ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
   2519 		freemsg(mp);
   2520 		ire_refrele(ire);
   2521 		return (-1);
   2522 	}
   2523 	ASSERT(ire->ire_type & IRE_MULTICAST);
   2524 	(*ire->ire_recvfn)(ire, mp, ipha, ira);
   2525 	ire_refrele(ire);
   2526 
   2527 	return (0);
   2528 }
   2529 
   2530 /*
   2531  * Send an encapsulated packet.
   2532  * Caller assumes can continue to use mp when routine returns.
   2533  */
   2534 /* ARGSUSED */
   2535 static void
   2536 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
   2537 {
   2538 	mblk_t 	*mp_copy;
   2539 	ipha_t 	*ipha_copy;
   2540 	size_t	len;
   2541 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2542 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2543 
   2544 	if (ipst->ips_ip_mrtdebug > 1) {
   2545 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2546 		    "encap_send: vif %ld enter",
   2547 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
   2548 	}
   2549 	len = ntohs(ipha->ipha_length);
   2550 
   2551 	/*
   2552 	 * Copy the old packet & pullup it's IP header into the
   2553 	 * new mbuf so we can modify it.  Try to fill the new
   2554 	 * mbuf since if we don't the ethernet driver will.
   2555 	 */
   2556 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
   2557 	if (mp_copy == NULL)
   2558 		return;
   2559 	mp_copy->b_rptr += 32;
   2560 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
   2561 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
   2562 		freeb(mp_copy);
   2563 		return;
   2564 	}
   2565 
   2566 	/*
   2567 	 * Fill in the encapsulating IP header.
   2568 	 * Remote tunnel dst in rmt_addr, from add_vif().
   2569 	 */
   2570 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
   2571 	*ipha_copy = multicast_encap_iphdr;
   2572 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
   2573 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
   2574 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
   2575 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
   2576 	ASSERT(ipha_copy->ipha_ident == 0);
   2577 
   2578 	/* Turn the encapsulated IP header back into a valid one. */
   2579 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
   2580 	ipha->ipha_ttl--;
   2581 	ipha->ipha_hdr_checksum = 0;
   2582 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
   2583 
   2584 	ipha_copy->ipha_ttl = ipha->ipha_ttl;
   2585 
   2586 	if (ipst->ips_ip_mrtdebug > 1) {
   2587 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2588 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
   2589 	}
   2590 	if (vifp->v_rate_limit <= 0)
   2591 		tbf_send_packet(vifp, mp_copy);
   2592 	else
   2593 		/* ipha is from the original header */
   2594 		tbf_control(vifp, mp_copy, ipha);
   2595 }
   2596 
   2597 /*
   2598  * De-encapsulate a packet and feed it back through IP input if it
   2599  * matches one of our multicast tunnels.
   2600  *
   2601  * This routine is called whenever IP gets a packet with prototype
   2602  * IPPROTO_ENCAP and a local destination address and the packet didn't
   2603  * match one of our configured IP-in-IP tunnels.
   2604  */
   2605 void
   2606 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
   2607 {
   2608 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   2609 	ipha_t		*ipha_encap;
   2610 	int		hlen = IPH_HDR_LENGTH(ipha);
   2611 	int		hlen_encap;
   2612 	ipaddr_t	src;
   2613 	struct vif	*vifp;
   2614 	ire_t		*ire;
   2615 	ill_t		*ill = ira->ira_ill;
   2616 	ip_stack_t	*ipst = ill->ill_ipst;
   2617 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2618 
   2619 	/* Make sure we have all of the inner header */
   2620 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
   2621 	if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
   2622 		ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
   2623 		if (ipha == NULL) {
   2624 			ipst->ips_mrtstat->mrts_bad_tunnel++;
   2625 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2626 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
   2627 			freemsg(mp);
   2628 			return;
   2629 		}
   2630 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
   2631 	}
   2632 	hlen_encap = IPH_HDR_LENGTH(ipha_encap);
   2633 	if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
   2634 		ipha = ip_pullup(mp, hlen + hlen_encap, ira);
   2635 		if (ipha == NULL) {
   2636 			ipst->ips_mrtstat->mrts_bad_tunnel++;
   2637 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2638 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
   2639 			freemsg(mp);
   2640 			return;
   2641 		}
   2642 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
   2643 	}
   2644 
   2645 	/*
   2646 	 * Dump the packet if it's not to a multicast destination or if
   2647 	 * we don't have an encapsulating tunnel with the source.
   2648 	 * Note:  This code assumes that the remote site IP address
   2649 	 * uniquely identifies the tunnel (i.e., that this site has
   2650 	 * at most one tunnel with the remote site).
   2651 	 */
   2652 	if (!CLASSD(ipha_encap->ipha_dst)) {
   2653 		ipst->ips_mrtstat->mrts_bad_tunnel++;
   2654 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
   2655 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2656 		ip_drop_input("mrts_bad_tunnel", mp, ill);
   2657 		freemsg(mp);
   2658 		return;
   2659 	}
   2660 	src = (ipaddr_t)ipha->ipha_src;
   2661 	mutex_enter(&ipst->ips_last_encap_lock);
   2662 	if (src != ipst->ips_last_encap_src) {
   2663 		struct vif *vife;
   2664 
   2665 		vifp = ipst->ips_vifs;
   2666 		vife = vifp + ipst->ips_numvifs;
   2667 		ipst->ips_last_encap_src = src;
   2668 		ipst->ips_last_encap_vif = 0;
   2669 		for (; vifp < vife; ++vifp) {
   2670 			if (!lock_good_vif(vifp))
   2671 				continue;
   2672 			if (vifp->v_rmt_addr.s_addr == src) {
   2673 				if (vifp->v_flags & VIFF_TUNNEL)
   2674 					ipst->ips_last_encap_vif = vifp;
   2675 				if (ipst->ips_ip_mrtdebug > 1) {
   2676 					(void) mi_strlog(mrouter->conn_rq,
   2677 					    1, SL_TRACE,
   2678 					    "ip_mroute_decap: good tun "
   2679 					    "vif %ld with %x",
   2680 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
   2681 					    ntohl(src));
   2682 				}
   2683 				unlock_good_vif(vifp);
   2684 				break;
   2685 			}
   2686 			unlock_good_vif(vifp);
   2687 		}
   2688 	}
   2689 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
   2690 		mutex_exit(&ipst->ips_last_encap_lock);
   2691 		ipst->ips_mrtstat->mrts_bad_tunnel++;
   2692 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2693 		ip_drop_input("mrts_bad_tunnel", mp, ill);
   2694 		freemsg(mp);
   2695 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
   2696 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
   2697 		return;
   2698 	}
   2699 	mutex_exit(&ipst->ips_last_encap_lock);
   2700 
   2701 	/*
   2702 	 * Need to pass in the tunnel source to ip_mforward (so that it can
   2703 	 * verify that the packet arrived over the correct vif.)
   2704 	 */
   2705 	ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
   2706 	ira->ira_mroute_tunnel = src;
   2707 	mp->b_rptr += hlen;
   2708 	ira->ira_pktlen -= hlen;
   2709 	ira->ira_ip_hdr_length = hlen_encap;
   2710 
   2711 	/*
   2712 	 * We don't redo any of the filtering in ill_input_full_v4 and we
   2713 	 * have checked that all of ipha_encap and any IP options are
   2714 	 * pulled up. Hence we call ire_recv_multicast_v4 directly.
   2715 	 * However, we have to check for RSVP as in ip_input_full_v4
   2716 	 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
   2717 	 * to the rsvpd.
   2718 	 */
   2719 	if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
   2720 	    ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
   2721 		ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
   2722 		    ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
   2723 		    B_TRUE, 0, ipst, NULL, NULL, NULL);
   2724 	} else {
   2725 		ire = ire_multicast(ill);
   2726 	}
   2727 	ASSERT(ire != NULL);
   2728 	/* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
   2729 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
   2730 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2731 		ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
   2732 		freemsg(mp);
   2733 		ire_refrele(ire);
   2734 		return;
   2735 	}
   2736 	ire->ire_ib_pkt_count++;
   2737 	ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
   2738 	(*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
   2739 	ire_refrele(ire);
   2740 }
   2741 
   2742 /*
   2743  * Remove all records with v_ipif == ipif.  Called when an interface goes away
   2744  * (stream closed).  Called as writer.
   2745  */
   2746 void
   2747 reset_mrt_vif_ipif(ipif_t *ipif)
   2748 {
   2749 	vifi_t vifi, tmp_vifi;
   2750 	vifi_t num_of_vifs;
   2751 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
   2752 
   2753 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
   2754 
   2755 	mutex_enter(&ipst->ips_numvifs_mutex);
   2756 	num_of_vifs = ipst->ips_numvifs;
   2757 	mutex_exit(&ipst->ips_numvifs_mutex);
   2758 
   2759 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
   2760 		tmp_vifi = vifi - 1;
   2761 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
   2762 			(void) del_vif(&tmp_vifi, ipst);
   2763 		}
   2764 	}
   2765 }
   2766 
   2767 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
   2768 void
   2769 reset_mrt_ill(ill_t *ill)
   2770 {
   2771 	struct mfc	*rt;
   2772 	struct rtdetq	*rte;
   2773 	int		i;
   2774 	ip_stack_t	*ipst = ill->ill_ipst;
   2775 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2776 	timeout_id_t	id;
   2777 
   2778 	for (i = 0; i < MFCTBLSIZ; i++) {
   2779 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
   2780 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
   2781 			if (ipst->ips_ip_mrtdebug > 1) {
   2782 				(void) mi_strlog(mrouter->conn_rq, 1,
   2783 				    SL_TRACE,
   2784 				    "reset_mrt_ill: mfctable [%d]", i);
   2785 			}
   2786 			while (rt != NULL) {
   2787 				mutex_enter(&rt->mfc_mutex);
   2788 				while ((rte = rt->mfc_rte) != NULL) {
   2789 					if (rte->ill == ill &&
   2790 					    (id = rt->mfc_timeout_id) != 0) {
   2791 						/*
   2792 						 * Its ok to drop the lock,  the
   2793 						 * struct cannot be freed since
   2794 						 * we have a ref on the hash
   2795 						 * bucket.
   2796 						 */
   2797 						mutex_exit(&rt->mfc_mutex);
   2798 						(void) untimeout(id);
   2799 						mutex_enter(&rt->mfc_mutex);
   2800 					}
   2801 					if (rte->ill == ill) {
   2802 						if (ipst->ips_ip_mrtdebug > 1) {
   2803 						(void) mi_strlog(
   2804 						    mrouter->conn_rq,
   2805 						    1, SL_TRACE,
   2806 						    "reset_mrt_ill: "
   2807 						    "ill 0x%p", (void *)ill);
   2808 						}
   2809 						rt->mfc_rte = rte->rte_next;
   2810 						freemsg(rte->mp);
   2811 						mi_free((char *)rte);
   2812 					}
   2813 				}
   2814 				mutex_exit(&rt->mfc_mutex);
   2815 				rt = rt->mfc_next;
   2816 			}
   2817 		}
   2818 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
   2819 	}
   2820 }
   2821 
   2822 /*
   2823  * Token bucket filter module.
   2824  * The ipha is for mcastgrp destination for phyint and encap.
   2825  */
   2826 static void
   2827 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
   2828 {
   2829 	size_t 	p_len =  msgdsize(mp);
   2830 	struct tbf	*t    = vifp->v_tbf;
   2831 	timeout_id_t id = 0;
   2832 	ill_t		*ill = vifp->v_ipif->ipif_ill;
   2833 	ip_stack_t	*ipst = ill->ill_ipst;
   2834 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2835 
   2836 	/* Drop if packet is too large */
   2837 	if (p_len > MAX_BKT_SIZE) {
   2838 		ipst->ips_mrtstat->mrts_pkt2large++;
   2839 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   2840 		ip_drop_output("tbf_control - too large", mp, ill);
   2841 		freemsg(mp);
   2842 		return;
   2843 	}
   2844 	if (ipst->ips_ip_mrtdebug > 1) {
   2845 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2846 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
   2847 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
   2848 		    ntohl(ipha->ipha_dst));
   2849 	}
   2850 
   2851 	mutex_enter(&t->tbf_lock);
   2852 
   2853 	tbf_update_tokens(vifp);
   2854 
   2855 	/*
   2856 	 * If there are enough tokens,
   2857 	 * and the queue is empty, send this packet out.
   2858 	 */
   2859 	if (ipst->ips_ip_mrtdebug > 1) {
   2860 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2861 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
   2862 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
   2863 		    t->tbf_q_len);
   2864 	}
   2865 	/* No packets are queued */
   2866 	if (t->tbf_q_len == 0) {
   2867 		/* queue empty, send packet if enough tokens */
   2868 		if (p_len <= t->tbf_n_tok) {
   2869 			t->tbf_n_tok -= p_len;
   2870 			mutex_exit(&t->tbf_lock);
   2871 			tbf_send_packet(vifp, mp);
   2872 			return;
   2873 		} else {
   2874 			/* Queue packet and timeout till later */
   2875 			tbf_queue(vifp, mp);
   2876 			ASSERT(vifp->v_timeout_id == 0);
   2877 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
   2878 			    TBF_REPROCESS);
   2879 		}
   2880 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
   2881 		/* Finite queue length, so queue pkts and process queue */
   2882 		tbf_queue(vifp, mp);
   2883 		tbf_process_q(vifp);
   2884 	} else {
   2885 		/* Check that we have UDP header with IP header */
   2886 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
   2887 		    sizeof (struct udphdr);
   2888 
   2889 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
   2890 			if (!pullupmsg(mp, hdr_length)) {
   2891 				BUMP_MIB(ill->ill_ip_mib,
   2892 				    ipIfStatsOutDiscards);
   2893 				ip_drop_output("tbf_control - pullup", mp, ill);
   2894 				freemsg(mp);
   2895 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
   2896 				    "vif %ld src 0x%x dst 0x%x\n",
   2897 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
   2898 				    ntohl(ipha->ipha_src),
   2899 				    ntohl(ipha->ipha_dst)));
   2900 				mutex_exit(&vifp->v_tbf->tbf_lock);
   2901 				return;
   2902 			} else
   2903 				/* Have to reassign ipha after pullupmsg */
   2904 				ipha = (ipha_t *)mp->b_rptr;
   2905 		}
   2906 		/*
   2907 		 * Queue length too much,
   2908 		 * try to selectively dq, or queue and process
   2909 		 */
   2910 		if (!tbf_dq_sel(vifp, ipha)) {
   2911 			ipst->ips_mrtstat->mrts_q_overflow++;
   2912 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   2913 			ip_drop_output("mrts_q_overflow", mp, ill);
   2914 			freemsg(mp);
   2915 		} else {
   2916 			tbf_queue(vifp, mp);
   2917 			tbf_process_q(vifp);
   2918 		}
   2919 	}
   2920 	if (t->tbf_q_len == 0) {
   2921 		id = vifp->v_timeout_id;
   2922 		vifp->v_timeout_id = 0;
   2923 	}
   2924 	mutex_exit(&vifp->v_tbf->tbf_lock);
   2925 	if (id != 0)
   2926 		(void) untimeout(id);
   2927 }
   2928 
   2929 /*
   2930  * Adds a packet to the tbf queue at the interface.
   2931  * The ipha is for mcastgrp destination for phyint and encap.
   2932  */
   2933 static void
   2934 tbf_queue(struct vif *vifp, mblk_t *mp)
   2935 {
   2936 	struct tbf	*t = vifp->v_tbf;
   2937 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2938 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2939 
   2940 	if (ipst->ips_ip_mrtdebug > 1) {
   2941 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2942 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
   2943 	}
   2944 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   2945 
   2946 	if (t->tbf_t == NULL) {
   2947 		/* Queue was empty */
   2948 		t->tbf_q = mp;
   2949 	} else {
   2950 		/* Insert at tail */
   2951 		t->tbf_t->b_next = mp;
   2952 	}
   2953 	/* set new tail pointer */
   2954 	t->tbf_t = mp;
   2955 
   2956 	mp->b_next = mp->b_prev = NULL;
   2957 
   2958 	t->tbf_q_len++;
   2959 }
   2960 
   2961 /*
   2962  * Process the queue at the vif interface.
   2963  * Drops the tbf_lock when sending packets.
   2964  *
   2965  * NOTE : The caller should quntimeout if the queue length is 0.
   2966  */
   2967 static void
   2968 tbf_process_q(struct vif *vifp)
   2969 {
   2970 	mblk_t	*mp;
   2971 	struct tbf	*t = vifp->v_tbf;
   2972 	size_t	len;
   2973 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2974 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2975 
   2976 	if (ipst->ips_ip_mrtdebug > 1) {
   2977 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2978 		    "tbf_process_q 1: vif %ld qlen = %d",
   2979 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
   2980 	}
   2981 
   2982 	/*
   2983 	 * Loop through the queue at the interface and send
   2984 	 * as many packets as possible.
   2985 	 */
   2986 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   2987 
   2988 	while (t->tbf_q_len > 0) {
   2989 		mp = t->tbf_q;
   2990 		len = (size_t)msgdsize(mp); /* length of ip pkt */
   2991 
   2992 		/* Determine if the packet can be sent */
   2993 		if (len <= t->tbf_n_tok) {
   2994 			/*
   2995 			 * If so, reduce no. of tokens, dequeue the packet,
   2996 			 * send the packet.
   2997 			 */
   2998 			t->tbf_n_tok -= len;
   2999 
   3000 			t->tbf_q = mp->b_next;
   3001 			if (--t->tbf_q_len == 0) {
   3002 				t->tbf_t = NULL;
   3003 			}
   3004 			mp->b_next = NULL;
   3005 			/* Exit mutex before sending packet, then re-enter */
   3006 			mutex_exit(&t->tbf_lock);
   3007 			tbf_send_packet(vifp, mp);
   3008 			mutex_enter(&t->tbf_lock);
   3009 		} else
   3010 			break;
   3011 	}
   3012 }
   3013 
   3014 /* Called at tbf timeout to update tokens, process q and reset timer.  */
   3015 static void
   3016 tbf_reprocess_q(void *arg)
   3017 {
   3018 	struct vif *vifp = arg;
   3019 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   3020 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3021 
   3022 	mutex_enter(&vifp->v_tbf->tbf_lock);
   3023 	vifp->v_timeout_id = 0;
   3024 	tbf_update_tokens(vifp);
   3025 
   3026 	tbf_process_q(vifp);
   3027 
   3028 	if (vifp->v_tbf->tbf_q_len > 0) {
   3029 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
   3030 		    TBF_REPROCESS);
   3031 	}
   3032 	mutex_exit(&vifp->v_tbf->tbf_lock);
   3033 
   3034 	if (ipst->ips_ip_mrtdebug > 1) {
   3035 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3036 		    "tbf_reprcess_q: vif %ld timeout id = %p",
   3037 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
   3038 	}
   3039 }
   3040 
   3041 /*
   3042  * Function that will selectively discard a member of the tbf queue,
   3043  * based on the precedence value and the priority.
   3044  *
   3045  * NOTE : The caller should quntimeout if the queue length is 0.
   3046  */
   3047 static int
   3048 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
   3049 {
   3050 	uint_t		p;
   3051 	struct tbf		*t = vifp->v_tbf;
   3052 	mblk_t		**np;
   3053 	mblk_t		*last, *mp;
   3054 	ill_t		*ill = vifp->v_ipif->ipif_ill;
   3055 	ip_stack_t	*ipst = ill->ill_ipst;
   3056 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3057 
   3058 	if (ipst->ips_ip_mrtdebug > 1) {
   3059 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3060 		    "dq_sel: vif %ld dst 0x%x",
   3061 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
   3062 	}
   3063 
   3064 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   3065 	p = priority(vifp, ipha);
   3066 
   3067 	np = &t->tbf_q;
   3068 	last = NULL;
   3069 	while ((mp = *np) != NULL) {
   3070 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
   3071 			*np = mp->b_next;
   3072 			/* If removing the last packet, fix the tail pointer */
   3073 			if (mp == t->tbf_t)
   3074 				t->tbf_t = last;
   3075 			mp->b_prev = mp->b_next = NULL;
   3076 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   3077 			ip_drop_output("tbf_dq_send", mp, ill);
   3078 			freemsg(mp);
   3079 			/*
   3080 			 * It's impossible for the queue to be empty, but
   3081 			 * we check anyway.
   3082 			 */
   3083 			if (--t->tbf_q_len == 0) {
   3084 				t->tbf_t = NULL;
   3085 			}
   3086 			ipst->ips_mrtstat->mrts_drop_sel++;
   3087 			return (1);
   3088 		}
   3089 		np = &mp->b_next;
   3090 		last = mp;
   3091 	}
   3092 	return (0);
   3093 }
   3094 
   3095 /* Sends packet, 2 cases - encap tunnel, phyint.  */
   3096 static void
   3097 tbf_send_packet(struct vif *vifp, mblk_t *mp)
   3098 {
   3099 	ipif_t		*ipif = vifp->v_ipif;
   3100 	ill_t		*ill = ipif->ipif_ill;
   3101 	ip_stack_t	*ipst = ill->ill_ipst;
   3102 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3103 	ipha_t		*ipha;
   3104 
   3105 	ipha = (ipha_t *)mp->b_rptr;
   3106 	/* If encap tunnel options */
   3107 	if (vifp->v_flags & VIFF_TUNNEL)  {
   3108 		ip_xmit_attr_t	ixas;
   3109 
   3110 		if (ipst->ips_ip_mrtdebug > 1) {
   3111 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3112 			    "tbf_send_packet: ENCAP tunnel vif %ld",
   3113 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
   3114 		}
   3115 		bzero(&ixas, sizeof (ixas));
   3116 		ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE;
   3117 		ixas.ixa_ipst = ipst;
   3118 		ixas.ixa_ifindex = 0;
   3119 		ixas.ixa_cred = kcred;
   3120 		ixas.ixa_cpid = NOPID;
   3121 		ixas.ixa_tsl = NULL;
   3122 		ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
   3123 		ixas.ixa_pktlen = ntohs(ipha->ipha_length);
   3124 		ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
   3125 
   3126 		/*
   3127 		 * Feed into ip_output_simple which will set the ident field
   3128 		 * and checksum the encapsulating header.
   3129 		 * BSD gets the cached route vifp->v_route from ip_output()
   3130 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
   3131 		 * One could make multicast forwarding faster by putting an
   3132 		 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
   3133 		 */
   3134 		(void) ip_output_simple(mp, &ixas);
   3135 		ixa_cleanup(&ixas);
   3136 		return;
   3137 
   3138 		/* phyint */
   3139 	} else {
   3140 		/* Need to loop back to members on the outgoing interface. */
   3141 		ipaddr_t	dst;
   3142 		ip_recv_attr_t	iras;
   3143 		nce_t		*nce;
   3144 
   3145 		bzero(&iras, sizeof (iras));
   3146 		iras.ira_flags = IRAF_IS_IPV4;
   3147 		iras.ira_ill = iras.ira_rill = ill;
   3148 		iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
   3149 		iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
   3150 		iras.ira_pktlen = ntohs(ipha->ipha_length);
   3151 		iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
   3152 
   3153 		dst = ipha->ipha_dst;
   3154 		if (ill_hasmembers_v4(ill, dst)) {
   3155 			iras.ira_flags |= IRAF_LOOPBACK_COPY;
   3156 		}
   3157 		if (ipst->ips_ip_mrtdebug > 1) {
   3158 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3159 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
   3160 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
   3161 		}
   3162 		/*
   3163 		 * Find an NCE which matches the nexthop.
   3164 		 * For a pt-pt interface we use the other end of the pt-pt
   3165 		 * link.
   3166 		 */
   3167 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
   3168 			dst = ipif->ipif_pp_dst_addr;
   3169 			nce = arp_nce_init(ill, dst, ill->ill_net_type);
   3170 		} else {
   3171 			nce = arp_nce_init(ill, dst, IRE_MULTICAST);
   3172 		}
   3173 		if (nce == NULL) {
   3174 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
   3175 			ip_drop_output("tbf_send_packet - no nce", mp, ill);
   3176 			freemsg(mp);
   3177 			return;
   3178 		}
   3179 
   3180 		/*
   3181 		 * We don't remeber the incoming ill. Thus we
   3182 		 * pretend the  packet arrived on the outbound ill. This means
   3183 		 * statistics for input errors will be increased on the wrong
   3184 		 * ill but that isn't a big deal.
   3185 		 */
   3186 		ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mtu, 0);
   3187 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
   3188 
   3189 		nce_refrele(nce);
   3190 	}
   3191 }
   3192 
   3193 /*
   3194  * Determine the current time and then the elapsed time (between the last time
   3195  * and time now).  Update the no. of tokens in the bucket.
   3196  */
   3197 static void
   3198 tbf_update_tokens(struct vif *vifp)
   3199 {
   3200 	timespec_t	tp;
   3201 	hrtime_t	tm;
   3202 	struct tbf	*t = vifp->v_tbf;
   3203 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   3204 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3205 
   3206 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   3207 
   3208 	/* Time in secs and nsecs, rate limit in kbits/sec */
   3209 	gethrestime(&tp);
   3210 
   3211 	/*LINTED*/
   3212 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
   3213 
   3214 	/*
   3215 	 * This formula is actually
   3216 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
   3217 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
   3218 	 *
   3219 	 * The (1000/1024) was introduced in add_vif to optimize
   3220 	 * this divide into a shift.
   3221 	 */
   3222 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
   3223 	t->tbf_last_pkt_t = tp;
   3224 
   3225 	if (t->tbf_n_tok > MAX_BKT_SIZE)
   3226 		t->tbf_n_tok = MAX_BKT_SIZE;
   3227 	if (ipst->ips_ip_mrtdebug > 1) {
   3228 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3229 		    "tbf_update_tok: tm %lld tok %d vif %ld",
   3230 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
   3231 	}
   3232 }
   3233 
   3234 /*
   3235  * Priority currently is based on port nos.
   3236  * Different forwarding mechanisms have different ways
   3237  * of obtaining the port no. Hence, the vif must be
   3238  * given along with the packet itself.
   3239  *
   3240  */
   3241 static int
   3242 priority(struct vif *vifp, ipha_t *ipha)
   3243 {
   3244 	int prio;
   3245 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   3246 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3247 
   3248 	/* Temporary hack; may add general packet classifier some day */
   3249 
   3250 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
   3251 
   3252 	/*
   3253 	 * The UDP port space is divided up into four priority ranges:
   3254 	 * [0, 16384)	: unclassified - lowest priority
   3255 	 * [16384, 32768)	: audio - highest priority
   3256 	 * [32768, 49152)	: whiteboard - medium priority
   3257 	 * [49152, 65536)	: video - low priority
   3258 	 */
   3259 
   3260 	if (ipha->ipha_protocol == IPPROTO_UDP) {
   3261 		struct udphdr *udp =
   3262 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
   3263 		switch (ntohs(udp->uh_dport) & 0xc000) {
   3264 		case 0x4000:
   3265 			prio = 70;
   3266 			break;
   3267 		case 0x8000:
   3268 			prio = 60;
   3269 			break;
   3270 		case 0xc000:
   3271 			prio = 55;
   3272 			break;
   3273 		default:
   3274 			prio = 50;
   3275 			break;
   3276 		}
   3277 		if (ipst->ips_ip_mrtdebug > 1) {
   3278 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3279 			    "priority: port %x prio %d\n",
   3280 			    ntohs(udp->uh_dport), prio);
   3281 		}
   3282 	} else
   3283 		prio = 50;  /* default priority */
   3284 	return (prio);
   3285 }
   3286 
   3287 /*
   3288  * End of token bucket filter modifications
   3289  */
   3290 
   3291 
   3292 
   3293 /*
   3294  * Produces data for netstat -M.
   3295  */
   3296 int
   3297 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
   3298 {
   3299 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
   3300 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
   3301 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
   3302 		sizeof (struct mrtstat))) {
   3303 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
   3304 		    (size_t)sizeof (struct mrtstat)));
   3305 		return (0);
   3306 	}
   3307 	return (1);
   3308 }
   3309 
   3310 /*
   3311  * Sends info for SNMP's MIB.
   3312  */
   3313 int
   3314 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
   3315 {
   3316 	struct vifctl 	vi;
   3317 	vifi_t		vifi;
   3318 
   3319 	mutex_enter(&ipst->ips_numvifs_mutex);
   3320 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
   3321 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
   3322 			continue;
   3323 		/*
   3324 		 * No locks here, an approximation is fine.
   3325 		 */
   3326 		vi.vifc_vifi = vifi;
   3327 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
   3328 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
   3329 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
   3330 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
   3331 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
   3332 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
   3333 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
   3334 
   3335 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
   3336 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
   3337 			    (size_t)sizeof (vi)));
   3338 			mutex_exit(&ipst->ips_numvifs_mutex);
   3339 			return (0);
   3340 		}
   3341 	}
   3342 	mutex_exit(&ipst->ips_numvifs_mutex);
   3343 	return (1);
   3344 }
   3345 
   3346 /*
   3347  * Called by ip_snmp_get to send up multicast routing table.
   3348  */
   3349 int
   3350 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
   3351 {
   3352 	int			i, j;
   3353 	struct mfc		*rt;
   3354 	struct mfcctl	mfcc;
   3355 
   3356 	/*
   3357 	 * Make sure multicast has not been turned off.
   3358 	 */
   3359 	if (is_mrouter_off(ipst))
   3360 		return (1);
   3361 
   3362 	/* Loop over all hash buckets and their chains */
   3363 	for (i = 0; i < MFCTBLSIZ; i++) {
   3364 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
   3365 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
   3366 			mutex_enter(&rt->mfc_mutex);
   3367 			if (rt->mfc_rte != NULL ||
   3368 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
   3369 				mutex_exit(&rt->mfc_mutex);
   3370 				continue;
   3371 			}
   3372 			mfcc.mfcc_origin = rt->mfc_origin;
   3373 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
   3374 			mfcc.mfcc_parent = rt->mfc_parent;
   3375 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
   3376 			mutex_enter(&ipst->ips_numvifs_mutex);
   3377 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
   3378 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
   3379 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
   3380 				mfcc.mfcc_ttls[j] = 0;
   3381 			mutex_exit(&ipst->ips_numvifs_mutex);
   3382 
   3383 			mutex_exit(&rt->mfc_mutex);
   3384 			if (!snmp_append_data(mp, (char *)&mfcc,
   3385 			    sizeof (mfcc))) {
   3386 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
   3387 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
   3388 				    (size_t)sizeof (mfcc)));
   3389 				return (0);
   3390 			}
   3391 		}
   3392 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
   3393 	}
   3394 	return (1);
   3395 }
   3396