Home | History | Annotate | Download | only in in.mpathd
      1 /*
      2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 
      6 /*
      7  * Copyright (c) 1987 Regents of the University of California.
      8  * All rights reserved.
      9  *
     10  * Redistribution and use in source and binary forms are permitted
     11  * provided that the above copyright notice and this paragraph are
     12  * duplicated in all such forms and that any documentation,
     13  * advertising materials, and other materials related to such
     14  * distribution and use acknowledge that the software was developed
     15  * by the University of California, Berkeley. The name of the
     16  * University may not be used to endorse or promote products derived
     17  * from this software without specific prior written permission.
     18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
     19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
     20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
     21  */
     22 
     23 #include "mpd_defs.h"
     24 #include "mpd_tables.h"
     25 
/*
 * Probe types for probe(). The chosen value is placed (in network byte
 * order) in the pr_icmp_mtype field of the outgoing packet, and the
 * receive paths compare the echoed field against these values to select
 * the reply handler.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
     34 
     35 /*
     36  * Format of probe / probe response packets. This is an ICMP Echo request
     37  * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6
     38  */
     39 struct pr_icmp
     40 {
     41 	uint8_t  pr_icmp_type;		/* type field */
     42 	uint8_t  pr_icmp_code;		/* code field */
     43 	uint16_t pr_icmp_cksum;		/* checksum field */
     44 	uint16_t pr_icmp_id;		/* Identification */
     45 	uint16_t pr_icmp_seq;		/* sequence number */
     46 	uint64_t pr_icmp_timestamp;	/* Time stamp (in ns) */
     47 	uint32_t pr_icmp_mtype;		/* Message type */
     48 };
     49 
/*
 * IPv6 all-nodes link-local multicast address (ff02::1). probe() uses it
 * as the destination of PROBE_MULTI packets on IPv6 instances.
 */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/*
 * IPv4 all-hosts multicast address (224.0.0.1). probe() uses it as the
 * destination of PROBE_MULTI packets on IPv4 instances.
 */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

/*
 * incoming_rtt_reply() only considers lowering the failure detection
 * time once MIN_SETTLING_TIME has elapsed since this time stamp.
 */
static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
     58 
/*
 * Forward declarations of the file-local (static) helpers.
 */

/* Ancillary data lookup and RTT bookkeeping */
static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
    int cmsg_type);
static void		pi_set_crtt(struct target *tg, int64_t m,
    boolean_t is_probe_uni);

/* Per probe type reply handlers, shared by the IPv4 and IPv6 paths */
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int 		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);
static int		ns2ms(int64_t ns);
static int64_t		tv2ns(struct timeval *);
     85 
     86 /*
     87  * CRTT - Conservative Round Trip Time Estimate
     88  * Probe success - A matching probe reply received before CRTT ms has elapsed
     89  *	after sending the probe.
     90  * Probe failure - No probe reply received and more than CRTT ms has elapsed
     91  *	after sending the probe.
     92  *
     93  * TLS - Time last success. Most recent probe ack received at this time.
     94  * TFF - Time first fail. The time of the earliest probe failure in
     95  *	a consecutive series of probe failures.
     96  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
     97  * 	before declaring phyint repair.
     98  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
     99  *	declare a phyint failure.
    100  *
    101  * 			Phyint state diagram
    102  *
    103  * The state of a phyint that is capable of being probed, is completely
    104  * specified by the 3-tuple <pi_state, pg_state, I>.
    105  *
    106  * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
    107  * IFF_OFFLINE is set.  If the phyint is also configured with a test address
    108  * (the common case) and probe targets, then a phyint must also successfully
    109  * be able to send and receive probes in order to remain in the PI_RUNNING
    110  * state (otherwise, it transitions to PI_FAILED).
    111  *
    112  * Further, if a PI_RUNNING phyint is configured with a test address but is
    113  * unable to find any probe targets, it will transition to the PI_NOTARGETS
    114  * state, which indicates that the link is apparently functional but that
    115  * in.mpathd is unable to send probes to verify functionality (in this case,
    116  * in.mpathd makes the optimistic assumption that the interface is working
    117  * correctly and thus does not mark the interface FAILED, but reports it as
    118  * IPMP_IF_UNKNOWN through the async events and query interfaces).
    119  *
    120  * At any point, a phyint may be administratively marked offline via if_mpadm.
    121  * In this case, the interface always transitions to PI_OFFLINE, regardless
    122  * of its previous state.  When the interface is later brought back online,
    123  * in.mpathd acts as if the interface is new (and thus it transitions to
    124  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
    125  * its probes, if probes are sent).
    126  *
    127  * pi_state -  PI_RUNNING or PI_FAILED
    128  *	PI_RUNNING: The failure detection logic says the phyint is good.
    129  *	PI_FAILED: The failure detection logic says the phyint has failed.
    130  *
    131  * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
    132  *	PG_OK: All interfaces in the group are OK.
    133  *	PG_DEGRADED: Some interfaces in the group are unusable.
    134  *	PG_FAILED: All interfaces in the group are unusable.
    135  *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets and multicast to the all-hosts address to reconstruct
 *	the target list, so the phyints are in the PI_NOTARGETS state.
    141  *
    142  * I -	value of (pi_flags & IFF_INACTIVE)
    143  *	IFF_INACTIVE: This phyint will not send or receive packets.
    144  *	Usually, inactive is tied to standby interfaces that are not yet
    145  *	needed (e.g., no non-standby interfaces in the group have failed).
    146  *	When failback has been disabled (FAILBACK=no configured), phyint can
    147  *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
    148  *	subsequently recovers after a failure.
    149  *
    150  * Not all 9 possible combinations of the above 3-tuple are possible.
    151  *
    152  * I is tracked by IP. pi_state is tracked by mpathd.
    153  *
    154  *			pi_state state machine
    155  * ---------------------------------------------------------------------------
    156  *	Event			State			New State
    157  *				Action:
    158  * ---------------------------------------------------------------------------
    159  *	IP interface failure	(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
    160  *	detection		: set IFF_FAILED on this phyint
    161  *
    162  *	IP interface failure	(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
    163  *	detection		: set IFF_FAILED on this phyint
    164  *
    165  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=yes)
    166  *	detection				     -> (PI_RUNNING, I == 0)
    167  *				: clear IFF_FAILED on this phyint
    168  *
    169  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=no)
    170  *	detection				     ->	(PI_RUNNING, I == 1)
    171  *				: clear IFF_FAILED on this phyint
    172  *				: if failback is disabled set I == 1
    173  *
    174  *	Group failure		(perform on all phyints in the group)
    175  *	detection 		PI_RUNNING		PI_FAILED
    176  *	(Router targets)	: set IFF_FAILED
    177  *
    178  *	Group failure		(perform on all phyints in the group)
    179  *	detection 		PI_RUNNING		PI_NOTARGETS
    180  *	(Host targets)		: set IFF_FAILED
    181  *				: delete the target list on all phyints
    182  * ---------------------------------------------------------------------------
    183  */
    184 
/*
 * Global (non-static) instance of struct probes_missed. The structure is
 * declared outside this part of the file -- presumably in one of the
 * mpd_*.h headers; TODO confirm its members and who updates them.
 */
struct probes_missed probes_missed;
    186 
    187 /*
    188  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
    189  * will be added on by the kernel.  The id field identifies this phyint.
    190  * and the sequence number is an increasing (modulo 2^^16) integer. The data
    191  * portion holds the time value when the packet is sent. On echo this is
    192  * extracted to compute the round-trip time. Three different types of
    193  * probe packets are used.
    194  *
    195  * PROBE_UNI: This type is used to do failure detection / failure recovery
    196  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
    197  *	not less than the current CRTT. pii_probes[] stores data
    198  *	about these probes. These packets consume sequence number space.
    199  *
    200  * PROBE_RTT: This type is used to make only rtt measurements. Normally these
    201  * 	are not used. Under heavy network load, the rtt may go up very high,
    202  *	due to a spike, or may appear to go high, due to extreme scheduling
    203  * 	delays. Once the network stress is removed, mpathd takes long time to
    204  *	recover, because the probe_interval is already high, and it takes
    205  *	a long time to send out sufficient number of probes to bring down the
    206  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
    207  *	user_probe_interval ms. and will cause only rtt updates. These packets
    208  *	do not consume sequence number space nor is information about these
    209  *	packets stored in the pii_probes[]
    210  *
    211  * PROBE_MULTI: This type is only used to construct a list of targets, when
    212  *	no targets are known. The packet is multicast to the all hosts addr.
    213  */
    214 static void
    215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
    216 {
    217 	hrtime_t sent_hrtime;
    218 	struct timeval sent_tv;
    219 	struct pr_icmp probe_pkt;	/* Probe packet */
    220 	struct sockaddr_storage targ;	/* target address */
    221 	uint_t	targaddrlen;		/* targed address length */
    222 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
    223 	boolean_t sent = _B_FALSE;
    224 	int	rval;
    225 
    226 	if (debug & D_TARGET) {
    227 		logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
    228 		    pii->pii_name, probe_type, start_hrtime);
    229 	}
    230 
    231 	assert(pii->pii_probe_sock != -1);
    232 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
    233 	    probe_type == PROBE_RTT);
    234 
    235 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
    236 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
    237 	probe_pkt.pr_icmp_code = 0;
    238 	probe_pkt.pr_icmp_cksum = 0;
    239 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
    240 
    241 	/*
    242 	 * Since there is no need to do arithmetic on the icmpid,
    243 	 * (only equality check is done) pii_icmpid is stored in
    244 	 * network byte order at initialization itself.
    245 	 */
    246 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
    247 	probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
    248 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
    249 
    250 	/*
    251 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
    252 	 * the all hosts address. Otherwise it is unicast to the next target.
    253 	 */
    254 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
    255 	    pii->pii_rtt_target_next != NULL));
    256 
    257 	bzero(&targ, sizeof (targ));
    258 	targ.ss_family = pii->pii_af;
    259 
    260 	if (pii->pii_af == AF_INET6) {
    261 		struct in6_addr *addr6;
    262 
    263 		addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
    264 		targaddrlen = sizeof (struct sockaddr_in6);
    265 		if (probe_type == PROBE_MULTI) {
    266 			*addr6 = all_nodes_mcast_v6;
    267 		} else if (probe_type == PROBE_UNI) {
    268 			*addr6 = pii->pii_target_next->tg_address;
    269 		} else { /* type is PROBE_RTT */
    270 			*addr6 = pii->pii_rtt_target_next->tg_address;
    271 		}
    272 	} else {
    273 		struct in_addr *addr4;
    274 
    275 		addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
    276 		targaddrlen = sizeof (struct sockaddr_in);
    277 		if (probe_type == PROBE_MULTI) {
    278 			*addr4 = all_nodes_mcast_v4;
    279 		} else if (probe_type == PROBE_UNI) {
    280 			IN6_V4MAPPED_TO_INADDR(
    281 			    &pii->pii_target_next->tg_address, addr4);
    282 		} else { /* type is PROBE_RTT */
    283 			IN6_V4MAPPED_TO_INADDR(
    284 			    &pii->pii_rtt_target_next->tg_address, addr4);
    285 		}
    286 
    287 		/*
    288 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
    289 		 */
    290 		probe_pkt.pr_icmp_cksum =
    291 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
    292 	}
    293 
    294 	/*
    295 	 * Use the current time as the time we sent.  Not atomic, but the best
    296 	 * we can do from here.
    297 	 */
    298 	sent_hrtime = gethrtime();
    299 	(void) gettimeofday(&sent_tv, NULL);
    300 	rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
    301 	    (struct sockaddr *)&targ, targaddrlen);
    302 	/*
    303 	 * If the send would block, this may either be transient or a hang in a
    304 	 * lower layer. We pretend the probe was actually sent, the daemon will
    305 	 * not see a reply to the probe and will fail the interface if normal
    306 	 * failure detection criteria are met.
    307 	 */
    308 	if (rval == sizeof (probe_pkt) ||
    309 	    (rval == -1 && errno == EWOULDBLOCK)) {
    310 		sent = _B_TRUE;
    311 	} else {
    312 		logperror_pii(pii, "probe: probe sendto");
    313 	}
    314 
    315 	/*
    316 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
    317 	 * update our tables. We will need this info in processing the probe
    318 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
    319 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
    320 	 * are only used to construct a list of targets. PROBE_RTT packets are
    321 	 * used only for updating the rtt and not for failure detection.
    322 	 */
    323 	if (probe_type == PROBE_UNI && sent) {
    324 		pr_ndx = pii->pii_probe_next;
    325 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
    326 
    327 		/* Collect statistics, before we reuse the last slot. */
    328 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
    329 			pii->pii_cum_stats.lost++;
    330 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
    331 			pii->pii_cum_stats.acked++;
    332 		pii->pii_cum_stats.sent++;
    333 
    334 		pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
    335 		pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
    336 		pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
    337 		pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
    338 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
    339 		probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
    340 
    341 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
    342 		pii->pii_target_next = target_next(pii->pii_target_next);
    343 		assert(pii->pii_target_next != NULL);
    344 		/*
    345 		 * If we have a single variable to denote the next target to
    346 		 * probe for both rtt probes and failure detection probes, we
    347 		 * could end up with a situation where the failure detection
    348 		 * probe targets become disjoint from the rtt probe targets.
    349 		 * Eg. if 2 targets and the actual fdt is double the user
    350 		 * specified fdt. So we have 2 variables. In this scheme
    351 		 * we also reset pii_rtt_target_next for every fdt probe,
    352 		 * though that may not be necessary.
    353 		 */
    354 		pii->pii_rtt_target_next = pii->pii_target_next;
    355 		pii->pii_snxt++;
    356 	} else if (probe_type == PROBE_RTT) {
    357 		pii->pii_rtt_target_next =
    358 		    target_next(pii->pii_rtt_target_next);
    359 		assert(pii->pii_rtt_target_next != NULL);
    360 	}
    361 }
    362 
    363 /*
    364  * Incoming IPv4 data from wire, is received here. Called from main.
    365  */
    366 void
    367 in_data(struct phyint_instance *pii)
    368 {
    369 	struct	sockaddr_in 	from;
    370 	struct	in6_addr	fromaddr;
    371 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
    372 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
    373 	struct ip *ip;
    374 	int 	iphlen;
    375 	int 	len;
    376 	char 	abuf[INET_ADDRSTRLEN];
    377 	struct msghdr msg;
    378 	struct iovec iov;
    379 	struct pr_icmp *reply;
    380 	struct timeval *recv_tvp;
    381 
    382 	if (debug & D_PROBE) {
    383 		logdebug("in_data(%s %s)\n",
    384 		    AF_STR(pii->pii_af), pii->pii_name);
    385 	}
    386 
    387 	iov.iov_base = (char *)in_packet;
    388 	iov.iov_len = sizeof (in_packet);
    389 	msg.msg_iov = &iov;
    390 	msg.msg_iovlen = 1;
    391 	msg.msg_name = (struct sockaddr *)&from;
    392 	msg.msg_namelen = sizeof (from);
    393 	msg.msg_control = ancillary_data;
    394 	msg.msg_controllen = sizeof (ancillary_data);
    395 
    396 	/*
    397 	 * Poll has already told us that a message is waiting,
    398 	 * on this socket. Read it now. We should not block.
    399 	 */
    400 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
    401 		logperror_pii(pii, "in_data: recvmsg");
    402 		return;
    403 	}
    404 
    405 	/*
    406 	 * If the datalink has indicated the link is down, don't go
    407 	 * any further.
    408 	 */
    409 	if (LINK_DOWN(pii->pii_phyint))
    410 		return;
    411 
    412 	/* Get the printable address for error reporting */
    413 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
    414 
    415 	/* Ignore packets > 64k or control buffers that don't fit */
    416 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
    417 		if (debug & D_PKTBAD) {
    418 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
    419 			    msg.msg_flags, abuf);
    420 		}
    421 		return;
    422 	}
    423 
    424 	/* Make sure packet contains at least minimum ICMP header */
    425 	ip = (struct ip *)in_packet;
    426 	iphlen = ip->ip_hl << 2;
    427 	if (len < iphlen + ICMP_MINLEN) {
    428 		if (debug & D_PKTBAD) {
    429 			logdebug("in_data: packet too short (%d bytes)"
    430 			    " from %s\n", len, abuf);
    431 		}
    432 		return;
    433 	}
    434 
    435 	/*
    436 	 * Subtract the IP hdr length, 'len' will be length of the probe
    437 	 * reply, starting from the icmp hdr.
    438 	 */
    439 	len -= iphlen;
    440 	/* LINTED */
    441 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
    442 
    443 	/* Probe replies are icmp echo replies. Ignore anything else */
    444 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
    445 		return;
    446 
    447 	/*
    448 	 * The icmp id should match what we sent, which is stored
    449 	 * in pi_icmpid. The icmp code for reply must be 0.
    450 	 * The reply content must be a struct pr_icmp
    451 	 */
    452 	if (reply->pr_icmp_id != pii->pii_icmpid) {
    453 		/* Not in response to our probe */
    454 		return;
    455 	}
    456 
    457 	if (reply->pr_icmp_code != 0) {
    458 		logtrace("probe reply code %d from %s on %s\n",
    459 		    reply->pr_icmp_code, abuf, pii->pii_name);
    460 		return;
    461 	}
    462 
    463 	if (len < sizeof (struct pr_icmp)) {
    464 		logtrace("probe reply too short: %d bytes from %s on %s\n",
    465 		    len, abuf, pii->pii_name);
    466 		return;
    467 	}
    468 
    469 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
    470 	if (recv_tvp == NULL) {
    471 		logtrace("message without timestamp from %s on %s\n",
    472 		    abuf, pii->pii_name);
    473 		return;
    474 	}
    475 
    476 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
    477 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
    478 		/* Unicast probe reply */
    479 		incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
    480 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
    481 		/* Multicast reply */
    482 		incoming_mcast_reply(pii, reply, fromaddr);
    483 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
    484 		incoming_rtt_reply(pii, reply, fromaddr);
    485 	} else {
    486 		/* Probably not in response to our probe */
    487 		logtrace("probe reply type: %d from %s on %s\n",
    488 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
    489 		return;
    490 	}
    491 }
    492 
    493 /*
    494  * Incoming IPv6 data from wire is received here. Called from main.
    495  */
    496 void
    497 in6_data(struct phyint_instance *pii)
    498 {
    499 	struct sockaddr_in6 from;
    500 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
    501 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
    502 	int len;
    503 	char abuf[INET6_ADDRSTRLEN];
    504 	struct msghdr msg;
    505 	struct iovec iov;
    506 	void	*opt;
    507 	struct	pr_icmp *reply;
    508 	struct	timeval *recv_tvp;
    509 
    510 	if (debug & D_PROBE) {
    511 		logdebug("in6_data(%s %s)\n",
    512 		    AF_STR(pii->pii_af), pii->pii_name);
    513 	}
    514 
    515 	iov.iov_base = (char *)in_packet;
    516 	iov.iov_len = sizeof (in_packet);
    517 	msg.msg_iov = &iov;
    518 	msg.msg_iovlen = 1;
    519 	msg.msg_name = (struct sockaddr *)&from;
    520 	msg.msg_namelen = sizeof (from);
    521 	msg.msg_control = ancillary_data;
    522 	msg.msg_controllen = sizeof (ancillary_data);
    523 
    524 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
    525 		logperror_pii(pii, "in6_data: recvmsg");
    526 		return;
    527 	}
    528 
    529 	/*
    530 	 * If the datalink has indicated that the link is down, don't go
    531 	 * any further.
    532 	 */
    533 	if (LINK_DOWN(pii->pii_phyint))
    534 		return;
    535 
    536 	/* Get the printable address for error reporting */
    537 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
    538 	if (len < ICMP_MINLEN) {
    539 		if (debug & D_PKTBAD) {
    540 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
    541 			    msg.msg_flags, abuf);
    542 		}
    543 		return;
    544 	}
    545 	/* Ignore packets > 64k or control buffers that don't fit */
    546 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
    547 		if (debug & D_PKTBAD) {
    548 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
    549 			    msg.msg_flags, abuf);
    550 		}
    551 		return;
    552 	}
    553 
    554 	reply = (struct pr_icmp *)in_packet;
    555 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
    556 		return;
    557 
    558 	if (reply->pr_icmp_id != pii->pii_icmpid) {
    559 		/* Not in response to our probe */
    560 		return;
    561 	}
    562 
    563 	/*
    564 	 * The kernel has already verified the the ICMP checksum.
    565 	 */
    566 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
    567 		logtrace("ICMPv6 echo reply source address not linklocal from "
    568 		    "%s on %s\n", abuf, pii->pii_name);
    569 		return;
    570 	}
    571 	opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
    572 	if (opt != NULL) {
    573 		/* Can't allow routing headers in probe replies  */
    574 		logtrace("message with routing header from %s on %s\n",
    575 		    abuf, pii->pii_name);
    576 		return;
    577 	}
    578 
    579 	if (reply->pr_icmp_code != 0) {
    580 		logtrace("probe reply code: %d from %s on %s\n",
    581 		    reply->pr_icmp_code, abuf, pii->pii_name);
    582 		return;
    583 	}
    584 	if (len < (sizeof (struct pr_icmp))) {
    585 		logtrace("probe reply too short: %d bytes from %s on %s\n",
    586 		    len, abuf, pii->pii_name);
    587 		return;
    588 	}
    589 
    590 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
    591 	if (recv_tvp == NULL) {
    592 		logtrace("message without timestamp from %s on %s\n",
    593 		    abuf, pii->pii_name);
    594 		return;
    595 	}
    596 
    597 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
    598 		incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
    599 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
    600 		incoming_mcast_reply(pii, reply, from.sin6_addr);
    601 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
    602 		incoming_rtt_reply(pii, reply, from.sin6_addr);
    603 	} else  {
    604 		/* Probably not in response to our probe */
    605 		logtrace("probe reply type: %d from %s on %s\n",
    606 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
    607 	}
    608 }
    609 
    610 /*
    611  * Process the incoming rtt reply, in response to our rtt probe.
    612  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
    613  * have any stored information about the probe we sent. So we don't log
    614  * any errors if we receive bad replies.
    615  */
    616 static void
    617 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    618     struct in6_addr fromaddr)
    619 {
    620 	int64_t	m;		/* rtt measurement in ns */
    621 	char	abuf[INET6_ADDRSTRLEN];
    622 	struct	target	*target;
    623 	struct 	phyint_group *pg;
    624 
    625 	/* Get the printable address for error reporting */
    626 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
    627 
    628 	if (debug & D_PROBE) {
    629 		logdebug("incoming_rtt_reply: %s %s %s\n",
    630 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
    631 	}
    632 
    633 	/* Do we know this target ? */
    634 	target = target_lookup(pii, fromaddr);
    635 	if (target == NULL)
    636 		return;
    637 
    638 	m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
    639 	/* Invalid rtt. It has wrapped around */
    640 	if (m < 0)
    641 		return;
    642 
    643 	/*
    644 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
    645 	 * The initial few responses after the interface is repaired may
    646 	 * contain high rtt's because they could have been queued up waiting
    647 	 * for ARP/NDP resolution on a failed interface.
    648 	 */
    649 	pg = pii->pii_phyint->pi_group;
    650 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
    651 		return;
    652 
    653 	/*
    654 	 * Update rtt only if the new rtt is lower than the current rtt.
    655 	 * (specified by the 3rd parameter to pi_set_crtt).
    656 	 * If a spike has caused the current probe_interval to be >
    657 	 * user_probe_interval, then this mechanism is used to bring down
    658 	 * the rtt rapidly once the network stress is removed.
    659 	 * If the new rtt is higher than the current rtt, we don't want to
    660 	 * update the rtt. We are having more than 1 outstanding probe and
    661 	 * the increase in rtt we are seeing is being unnecessarily weighted
    662 	 * many times. The regular rtt update will be handled by
    663 	 * incoming_echo_reply() and will take care of any rtt increase.
    664 	 */
    665 	pi_set_crtt(target, m, _B_FALSE);
    666 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
    667 	    (user_failure_detection_time < pg->pg_fdt) &&
    668 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
    669 		/*
    670 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
    671 		 * investigate if we can improve the failure detection time to
    672 		 * meet whatever the user specified.
    673 		 */
    674 		if (check_pg_crtt_improved(pg)) {
    675 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
    676 			    user_failure_detection_time);
    677 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
    678 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
    679 				logerr("Improved failure detection time %d ms "
    680 				    "on (%s %s) for group \"%s\"\n",
    681 				    pg->pg_fdt, AF_STR(pii->pii_af),
    682 				    pii->pii_name,
    683 				    pii->pii_phyint->pi_group->pg_name);
    684 			}
    685 			if (user_failure_detection_time == pg->pg_fdt) {
    686 				/* Avoid any truncation or rounding errors */
    687 				pg->pg_probeint = user_probe_interval;
    688 				/*
    689 				 * No more rtt probes will be sent. The actual
    690 				 * fdt has dropped to the user specified value.
    691 				 * pii_fd_snxt_basetime and pii_snxt_basetime
    692 				 * will be in sync henceforth.
    693 				 */
    694 				reset_snxt_basetimes();
    695 			}
    696 		}
    697 	}
    698 }
    699 
    700 /*
    701  * Process the incoming echo reply, in response to our unicast probe.
    702  * Common for both IPv4 and IPv6
    703  */
    704 static void
    705 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    706     struct in6_addr fromaddr, struct timeval *recv_tvp)
    707 {
    708 	int64_t	m;		/* rtt measurement in ns */
    709 	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
    710 	char	abuf[INET6_ADDRSTRLEN];
    711 	int	pr_ndx;
    712 	struct	target	*target;
    713 	boolean_t exception;
    714 	uint64_t pr_icmp_timestamp;
    715 	uint16_t pr_icmp_seq;
    716 	struct	probe_stats *pr_statp;
    717 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
    718 
    719 	/* Get the printable address for error reporting */
    720 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
    721 
    722 	if (debug & D_PROBE) {
    723 		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
    724 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
    725 		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
    726 	}
    727 
    728 	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
    729 	pr_icmp_seq = ntohs(reply->pr_icmp_seq);
    730 
    731 	/* Reject out of window probe replies */
    732 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
    733 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
    734 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
    735 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
    736 		pii->pii_cum_stats.unknown++;
    737 		return;
    738 	}
    739 
    740 	cur_hrtime = gethrtime();
    741 	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
    742 	if (m < 0) {
    743 		/*
    744 		 * This is a ridiculously high value of rtt. rtt has wrapped
    745 		 * around. Log a message, and ignore the rtt.
    746 		 */
    747 		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
    748 		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
    749 	}
    750 
    751 	/*
    752 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
    753 	 * number in our pii->pii_probes[] array. The icmp sequence number
    754 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
    755 	 */
    756 	pr_ndx = MOD_SUB(pii->pii_probe_next,
    757 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
    758 
    759 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
    760 
    761 	target = pii->pii_probes[pr_ndx].pr_target;
    762 
    763 	/*
    764 	 * Perform sanity checks, whether this probe reply that we
    765 	 * have received is genuine
    766 	 */
    767 	if (target != NULL) {
    768 		/*
    769 		 * Compare the src. addr of the received ICMP or ICMPv6
    770 		 * probe reply with the target address in our tables.
    771 		 */
    772 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
    773 			/*
    774 			 * We don't have any record of having sent a probe to
    775 			 * this target. This is a fake probe reply. Log an error
    776 			 */
    777 			logtrace("probe status %d Fake probe reply seq %u "
    778 			    "snxt %u on %s from %s\n",
    779 			    pii->pii_probes[pr_ndx].pr_status,
    780 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
    781 			pii->pii_cum_stats.unknown++;
    782 			return;
    783 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
    784 			/*
    785 			 * The address matches, but our tables indicate that
    786 			 * this probe reply has been acked already. So this
    787 			 * is a duplicate probe reply. Log an error
    788 			 */
    789 			logtrace("probe status %d Duplicate probe reply seq %u "
    790 			    "snxt %u on %s from %s\n",
    791 			    pii->pii_probes[pr_ndx].pr_status,
    792 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
    793 			pii->pii_cum_stats.unknown++;
    794 			return;
    795 		}
    796 	} else {
    797 		/*
    798 		 * Target must not be NULL in the PR_UNACKED state
    799 		 */
    800 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
    801 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
    802 			/*
    803 			 * The probe stats slot is unused. So we didn't
    804 			 * send out any probe to this target. This is a fake.
    805 			 * Log an error.
    806 			 */
    807 			logtrace("probe status %d Fake probe reply seq %u "
    808 			    "snxt %u on %s from %s\n",
    809 			    pii->pii_probes[pr_ndx].pr_status,
    810 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
    811 		}
    812 		pii->pii_cum_stats.unknown++;
    813 		return;
    814 	}
    815 
    816 	/*
    817 	 * If the rtt does not appear to be right, don't update the
    818 	 * rtt stats. This can happen if the system dropped into the
    819 	 * debugger, or the system was hung or too busy for a
    820 	 * substantial time that we didn't get a chance to run.
    821 	 */
    822 	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
    823 		/*
    824 		 * If the probe corresponding to this received response
    825 		 * was truly sent 'm' ns. ago, then this response must
    826 		 * have been rejected by the sequence number checks. The
    827 		 * fact that it has passed the sequence number checks
    828 		 * means that the measured rtt is wrong. We were probably
    829 		 * scheduled long after the packet was received.
    830 		 */
    831 		goto out;
    832 	}
    833 
    834 	/*
    835 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
    836 	 * The initial few responses after the interface is repaired may
    837 	 * contain high rtt's because they could have been queued up waiting
    838 	 * for ARP/NDP resolution on a failed interface.
    839 	 */
    840 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
    841 		goto out;
    842 
    843 	/*
    844 	 * Don't update the Conservative Round Trip Time estimate for this
    845 	 * (phint, target) pair if this is the not the highest ack seq seen
    846 	 * thus far on this target.
    847 	 */
    848 	if (!highest_ack_tg(pr_icmp_seq, target))
    849 		goto out;
    850 
    851 	/*
    852 	 * Always update the rtt. This is a failure detection probe
    853 	 * and we want to measure both increase / decrease in rtt.
    854 	 */
    855 	pi_set_crtt(target, m, _B_TRUE);
    856 
    857 	/*
    858 	 * If the crtt exceeds the average time between probes,
    859 	 * investigate if this slow target is an exception. If so we
    860 	 * can avoid this target and still meet the failure detection
    861 	 * time. Otherwise we can't meet the failure detection time.
    862 	 */
    863 	if (target->tg_crtt > pg->pg_probeint) {
    864 		exception = check_exception_target(pii, target);
    865 		if (exception) {
    866 			/*
    867 			 * This target is exceptionally slow. Don't use it
    868 			 * for future probes. check_exception_target() has
    869 			 * made sure that we have at least MIN_PROBE_TARGETS
    870 			 * other active targets
    871 			 */
    872 			if (pii->pii_targets_are_routers) {
    873 				/*
    874 				 * This is a slow router, mark it as slow
    875 				 * and don't use it for further probes. We
    876 				 * don't delete it, since it will be populated
    877 				 * again when we do a router scan. Hence we
    878 				 * need to maintain extra state (unlike the
    879 				 * host case below).  Mark it as TG_SLOW.
    880 				 */
    881 				if (target->tg_status == TG_ACTIVE)
    882 					pii->pii_ntargets--;
    883 				target->tg_status = TG_SLOW;
    884 				target->tg_latime = gethrtime();
    885 				target->tg_rtt_sa = -1;
    886 				target->tg_crtt = 0;
    887 				target->tg_rtt_sd = 0;
    888 				if (pii->pii_target_next == target) {
    889 					pii->pii_target_next =
    890 					    target_next(target);
    891 				}
    892 			} else {
    893 				/*
    894 				 * the slow target is not a router, we can
    895 				 * just delete it. Send an icmp multicast and
    896 				 * pick the fastest responder that is not
    897 				 * already an active target. target_delete()
    898 				 * adjusts pii->pii_target_next
    899 				 */
    900 				target_delete(target);
    901 				probe(pii, PROBE_MULTI, cur_hrtime);
    902 			}
    903 		} else {
    904 			/*
    905 			 * We can't meet the failure detection time.
    906 			 * Log a message, and update the detection time to
    907 			 * whatever we can achieve.
    908 			 */
    909 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
    910 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
    911 			last_fdt_bumpup_time = gethrtime();
    912 			if (pg != phyint_anongroup) {
    913 				logtrace("Cannot meet requested failure"
    914 				    " detection time of %d ms on (%s %s) new"
    915 				    " failure detection time for group \"%s\""
    916 				    " is %d ms\n", user_failure_detection_time,
    917 				    AF_STR(pii->pii_af), pii->pii_name,
    918 				    pg->pg_name, pg->pg_fdt);
    919 			}
    920 		}
    921 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
    922 	    (user_failure_detection_time < pg->pg_fdt) &&
    923 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
    924 		/*
    925 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
    926 		 * investigate if we can improve the failure detection time to
    927 		 * meet whatever the user specified.
    928 		 */
    929 		if (check_pg_crtt_improved(pg)) {
    930 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
    931 			    user_failure_detection_time);
    932 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
    933 			if (pg != phyint_anongroup) {
    934 				logtrace("Improved failure detection time %d ms"
    935 				    " on (%s %s) for group \"%s\"\n",
    936 				    pg->pg_fdt, AF_STR(pii->pii_af),
    937 				    pii->pii_name, pg->pg_name);
    938 			}
    939 			if (user_failure_detection_time == pg->pg_fdt) {
    940 				/* Avoid any truncation or rounding errors */
    941 				pg->pg_probeint = user_probe_interval;
    942 				/*
    943 				 * No more rtt probes will be sent. The actual
    944 				 * fdt has dropped to the user specified value.
    945 				 * pii_fd_snxt_basetime and pii_snxt_basetime
    946 				 * will be in sync henceforth.
    947 				 */
    948 				reset_snxt_basetimes();
    949 			}
    950 		}
    951 	}
    952 out:
    953 	pr_statp = &pii->pii_probes[pr_ndx];
    954 	pr_statp->pr_hrtime_ackproc = cur_hrtime;
    955 	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
    956 	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
    957 
    958 	probe_chstate(pr_statp, pii, PR_ACKED);
    959 
    960 	/*
    961 	 * Update pii->pii_rack, i.e. the sequence number of the last received
    962 	 * probe response, based on the echo reply we have received now, if
    963 	 * either of the following conditions are satisfied.
    964 	 * a. pii_rack is outside the current receive window of
    965 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
    966 	 *    This means we have not received probe responses for a
    967 	 *    long time, and the sequence number has wrapped around.
    968 	 * b. pii_rack is within the current receive window and this echo
    969 	 *    reply corresponds to the highest sequence number we have seen
    970 	 *    so far.
    971 	 */
    972 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
    973 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
    974 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
    975 		pii->pii_rack = pr_icmp_seq;
    976 	}
    977 }
    978 
    979 /*
    980  * Returns true if seq is the highest unacknowledged seq for target tg
    981  * else returns false
    982  */
    983 static boolean_t
    984 highest_ack_tg(uint16_t seq, struct target *tg)
    985 {
    986 	struct phyint_instance *pii;
    987 	int	 pr_ndx;
    988 	uint16_t pr_seq;
    989 
    990 	pii = tg->tg_phyint_inst;
    991 
    992 	/*
    993 	 * Get the seq number of the most recent probe sent so far,
    994 	 * and also get the corresponding probe index in the probe stats
    995 	 * array.
    996 	 */
    997 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
    998 	pr_seq = pii->pii_snxt;
    999 	pr_seq--;
   1000 
   1001 	/*
   1002 	 * Start from the most recent probe and walk back, trying to find
   1003 	 * an acked probe corresponding to target tg.
   1004 	 */
   1005 	for (; pr_ndx != pii->pii_probe_next;
   1006 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
   1007 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
   1008 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
   1009 			if (SEQ_GT(pr_seq, seq))
   1010 				return (_B_FALSE);
   1011 		}
   1012 	}
   1013 	return (_B_TRUE);
   1014 }
   1015 
   1016 /*
   1017  * Check whether the crtt for the group has improved by a factor of
   1018  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
   1019  * detection time flapping in the face of small crtt changes.
   1020  */
   1021 static boolean_t
   1022 check_pg_crtt_improved(struct phyint_group *pg)
   1023 {
   1024 	struct	phyint *pi;
   1025 
   1026 	if (debug & D_PROBE)
   1027 		logdebug("check_pg_crtt_improved()\n");
   1028 
   1029 	/*
   1030 	 * The crtt for the group is only improved if each phyint_instance
   1031 	 * for both ipv4 and ipv6 is improved.
   1032 	 */
   1033 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
   1034 		if (!check_pii_crtt_improved(pi->pi_v4) ||
   1035 		    !check_pii_crtt_improved(pi->pi_v6))
   1036 			return (_B_FALSE);
   1037 	}
   1038 
   1039 	return (_B_TRUE);
   1040 }
   1041 
   1042 /*
   1043  * Check whether the crtt has improved substantially on this phyint_instance.
   1044  * Returns _B_TRUE if there's no crtt information available, because pii
   1045  * is NULL or the phyint_instance is not capable of probing.
   1046  */
   1047 boolean_t
   1048 check_pii_crtt_improved(struct phyint_instance *pii) {
   1049 	struct 	target *tg;
   1050 
   1051 	if (pii == NULL)
   1052 		return (_B_TRUE);
   1053 
   1054 	if (!PROBE_CAPABLE(pii) ||
   1055 	    pii->pii_phyint->pi_state == PI_FAILED)
   1056 		return (_B_TRUE);
   1057 
   1058 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
   1059 		if (tg->tg_status != TG_ACTIVE)
   1060 			continue;
   1061 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
   1062 		    LOWER_FDT_TRIGGER)) {
   1063 			return (_B_FALSE);
   1064 		}
   1065 	}
   1066 
   1067 	return (_B_TRUE);
   1068 }
   1069 
   1070 /*
   1071  * This target responds very slowly to probes. The target's crtt exceeds
   1072  * the probe interval of its group. Compare against other targets
   1073  * and determine if this target is an exception, if so return true, else false
   1074  */
   1075 static boolean_t
   1076 check_exception_target(struct phyint_instance *pii, struct target *target)
   1077 {
   1078 	struct	target *tg;
   1079 	char abuf[INET6_ADDRSTRLEN];
   1080 
   1081 	if (debug & D_PROBE) {
   1082 		logdebug("check_exception_target(%s %s target %s)\n",
   1083 		    AF_STR(pii->pii_af), pii->pii_name,
   1084 		    pr_addr(pii->pii_af, target->tg_address,
   1085 		    abuf, sizeof (abuf)));
   1086 	}
   1087 
   1088 	/*
   1089 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
   1090 	 * to make a good judgement. Otherwise don't drop this target.
   1091 	 */
   1092 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
   1093 		return (_B_FALSE);
   1094 
   1095 	/*
   1096 	 * Determine whether only this particular target is slow.
   1097 	 * We know that this target's crtt exceeds the group's probe interval.
   1098 	 * If all other active targets have a
   1099 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
   1100 	 * then this target is considered slow.
   1101 	 */
   1102 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
   1103 		if (tg != target && tg->tg_status == TG_ACTIVE) {
   1104 			if (tg->tg_crtt >
   1105 			    pii->pii_phyint->pi_group->pg_probeint /
   1106 			    EXCEPTION_FACTOR) {
   1107 				return (_B_FALSE);
   1108 			}
   1109 		}
   1110 	}
   1111 
   1112 	return (_B_TRUE);
   1113 }
   1114 
   1115 /*
   1116  * Update the target list. The icmp all hosts multicast has given us
   1117  * some host to which we can send probes. If we already have sufficient
   1118  * targets, discard it.
   1119  */
   1120 static void
   1121 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
   1122     struct in6_addr fromaddr)
   1123 /* ARGSUSED */
   1124 {
   1125 	int af;
   1126 	char abuf[INET6_ADDRSTRLEN];
   1127 	struct phyint *pi;
   1128 
   1129 	if (debug & D_PROBE) {
   1130 		logdebug("incoming_mcast_reply(%s %s %s)\n",
   1131 		    AF_STR(pii->pii_af), pii->pii_name,
   1132 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
   1133 	}
   1134 
   1135 	/*
   1136 	 * Using host targets is a fallback mechanism. If we have
   1137 	 * found a router, don't add this host target. If we already
   1138 	 * know MAX_PROBE_TARGETS, don't add another target.
   1139 	 */
   1140 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
   1141 	if (pii->pii_targets != NULL) {
   1142 		if (pii->pii_targets_are_routers ||
   1143 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
   1144 			return;
   1145 		}
   1146 	}
   1147 
   1148 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
   1149 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
   1150 		/*
   1151 		 * Guard against response from 0.0.0.0
   1152 		 * and ::. Log a trace message
   1153 		 */
   1154 		logtrace("probe response from %s on %s\n",
   1155 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
   1156 		    pii->pii_name);
   1157 		return;
   1158 	}
   1159 
   1160 	/*
   1161 	 * This address is one of our own, so reject this address as a
   1162 	 * valid probe target.
   1163 	 */
   1164 	af = pii->pii_af;
   1165 	if (own_address(fromaddr))
   1166 		return;
   1167 
   1168 	/*
   1169 	 * If the phyint is part a named group, then add the address to all
   1170 	 * members of the group.  Otherwise, add the address only to the
   1171 	 * phyint itself, since other phyints in the anongroup may not be on
   1172 	 * the same subnet.
   1173 	 */
   1174 	pi = pii->pii_phyint;
   1175 	if (pi->pi_group == phyint_anongroup) {
   1176 		target_add(pii, fromaddr, _B_FALSE);
   1177 	} else {
   1178 		pi = pi->pi_group->pg_phyint;
   1179 		for (; pi != NULL; pi = pi->pi_pgnext)
   1180 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
   1181 	}
   1182 }
   1183 
   1184 /*
   1185  * Compute CRTT given an existing scaled average, scaled deviation estimate
   1186  * and a new rtt time.  The formula is from Jacobson and Karels'
   1187  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
   1188  * are the same as those in Appendix A.2 of that paper.
   1189  *
   1190  * m = new measurement
   1191  * sa = scaled RTT average (8 * average estimates)
   1192  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
   1193  * crtt = Conservative round trip time. Used to determine whether probe
   1194  * has timed out.
   1195  *
   1196  * New scaled average and deviation are passed back via sap and svp
   1197  */
   1198 static int64_t
   1199 compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
   1200 {
   1201 	int64_t sa = *sap;
   1202 	int64_t sv = *svp;
   1203 	int64_t crtt;
   1204 	int64_t saved_m = m;
   1205 
   1206 	assert(*sap >= -1);
   1207 	assert(*svp >= 0);
   1208 
   1209 	if (sa != -1) {
   1210 		/*
   1211 		 * Update average estimator:
   1212 		 *	new rtt = old rtt + 1/8 Error
   1213 		 *	    where Error = m - old rtt
   1214 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
   1215 		 *	i.e. new sa =  old sa + Error
   1216 		 */
   1217 		m -= sa >> 3;		/* m is now Error in estimate. */
   1218 		if ((sa += m) < 0) {
   1219 			/* Don't allow the smoothed average to be negative. */
   1220 			sa = 0;
   1221 		}
   1222 
   1223 		/*
   1224 		 * Update deviation estimator:
   1225 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
   1226 		 *	i.e. 4 * new mdev = 4 * old mdev +
   1227 		 *		(abs(Error) - old mdev)
   1228 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
   1229 		 */
   1230 		if (m < 0)
   1231 			m = -m;
   1232 		m -= sv >> 2;
   1233 		sv += m;
   1234 	} else {
   1235 		/* Initialization. This is the first response received. */
   1236 		sa = (m << 3);
   1237 		sv = (m << 1);
   1238 	}
   1239 
   1240 	crtt = (sa >> 3) + sv;
   1241 
   1242 	if (debug & D_PROBE) {
   1243 		logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
   1244 		    "crtt = %lld\n", saved_m, sa, sv, crtt);
   1245 	}
   1246 
   1247 	*sap = sa;
   1248 	*svp = sv;
   1249 
   1250 	/*
   1251 	 * CRTT = average estimates  + 4 * deviation estimates
   1252 	 *	= sa / 8 + sv
   1253 	 */
   1254 	return (crtt);
   1255 }
   1256 
   1257 static void
   1258 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
   1259 {
   1260 	struct phyint_instance *pii = tg->tg_phyint_inst;
   1261 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
   1262 	int64_t sa = tg->tg_rtt_sa;
   1263 	int64_t sv = tg->tg_rtt_sd;
   1264 	int new_crtt;
   1265 	int i;
   1266 
   1267 	if (debug & D_PROBE)
   1268 		logdebug("pi_set_crtt: target -  m %lld\n", m);
   1269 
   1270 	/* store the round trip time, in case we need to defer computation */
   1271 	tg->tg_deferred[tg->tg_num_deferred] = m;
   1272 
   1273 	new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
   1274 
   1275 	/*
   1276 	 * If this probe's round trip time would singlehandedly cause an
   1277 	 * increase in the group's probe interval consider it suspect.
   1278 	 */
   1279 	if ((new_crtt > probe_interval) && is_probe_uni) {
   1280 		if (debug & D_PROBE) {
   1281 			logdebug("Received a suspect probe on %s, new_crtt ="
   1282 			    " %d, probe_interval = %d, num_deferred = %d\n",
   1283 			    pii->pii_probe_logint->li_name, new_crtt,
   1284 			    probe_interval, tg->tg_num_deferred);
   1285 		}
   1286 
   1287 		/*
   1288 		 * If we've deferred as many rtts as we plan on deferring, then
   1289 		 * assume the link really did slow down and process all queued
   1290 		 * rtts
   1291 		 */
   1292 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
   1293 			if (debug & D_PROBE) {
   1294 				logdebug("Received MAXDEFERREDRTT probes which "
   1295 				    "would cause an increased probe_interval.  "
   1296 				    "Integrating queued rtt data points.\n");
   1297 			}
   1298 
   1299 			for (i = 0; i <= tg->tg_num_deferred; i++) {
   1300 				tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
   1301 				    &tg->tg_rtt_sd, tg->tg_deferred[i]));
   1302 			}
   1303 
   1304 			tg->tg_num_deferred = 0;
   1305 		} else {
   1306 			tg->tg_num_deferred++;
   1307 		}
   1308 		return;
   1309 	}
   1310 
   1311 	/*
   1312 	 * If this is a normal probe, or an RTT probe that would lead to a
   1313 	 * reduced CRTT, then update our CRTT data.  Further, if this was
   1314 	 * a normal probe, pitch any deferred probes since our probes are
   1315 	 * again being answered within our CRTT estimates.
   1316 	 */
   1317 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
   1318 		tg->tg_rtt_sa = sa;
   1319 		tg->tg_rtt_sd = sv;
   1320 		tg->tg_crtt = new_crtt;
   1321 		if (is_probe_uni)
   1322 			tg->tg_num_deferred = 0;
   1323 	}
   1324 }
   1325 
   1326 /*
   1327  * Return a pointer to the specified option buffer.
   1328  * If not found return NULL.
   1329  */
   1330 static void *
   1331 find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
   1332 {
   1333 	struct cmsghdr *cmsg;
   1334 
   1335 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
   1336 	    cmsg = CMSG_NXTHDR(msg, cmsg)) {
   1337 		if (cmsg->cmsg_level == cmsg_level &&
   1338 		    cmsg->cmsg_type == cmsg_type) {
   1339 			return (CMSG_DATA(cmsg));
   1340 		}
   1341 	}
   1342 	return (NULL);
   1343 }
   1344 
   1345 /*
   1346  * Try to activate another INACTIVE interface in the same group as `pi'.
   1347  * Prefer STANDBY INACTIVE to just INACTIVE.
   1348  */
   1349 void
   1350 phyint_activate_another(struct phyint *pi)
   1351 {
   1352 	struct phyint *pi2;
   1353 	struct phyint *inactivepi = NULL;
   1354 
   1355 	if (pi->pi_group == phyint_anongroup)
   1356 		return;
   1357 
   1358 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
   1359 		if (pi == pi2 || !phyint_is_functioning(pi2) ||
   1360 		    !(pi2->pi_flags & IFF_INACTIVE))
   1361 			continue;
   1362 
   1363 		inactivepi = pi2;
   1364 		if (pi2->pi_flags & IFF_STANDBY)
   1365 			break;
   1366 	}
   1367 
   1368 	if (inactivepi != NULL)
   1369 		(void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
   1370 }
   1371 
   1372 /*
   1373  * Transition a phyint to PI_RUNNING.  The caller must ensure that the
   1374  * transition is appropriate.  Clears IFF_OFFLINE or IFF_FAILED if
   1375  * appropriate.  Also sets IFF_INACTIVE on this or other interfaces as
   1376  * appropriate (see comment below).  Finally, also updates the phyint's group
   1377  * state to account for the change.
   1378  */
   1379 void
   1380 phyint_transition_to_running(struct phyint *pi)
   1381 {
   1382 	struct phyint *pi2;
   1383 	struct phyint *actstandbypi = NULL;
   1384 	uint_t nactive = 0, nnonstandby = 0;
   1385 	boolean_t onlining = (pi->pi_state == PI_OFFLINE);
   1386 	boolean_t initial = (pi->pi_state == PI_INIT);
   1387 	uint64_t set, clear;
   1388 
   1389 	/*
   1390 	 * The interface is running again, but should it or another interface
   1391 	 * in the group end up INACTIVE?  There are three cases:
   1392 	 *
   1393 	 * 1. If it's a STANDBY interface, it should be end up INACTIVE if
   1394 	 *    the group is operating at capacity (i.e., there are at least as
   1395 	 *    many active interfaces as non-STANDBY interfaces in the group).
   1396 	 *    No other interfaces should be changed.
   1397 	 *
   1398 	 * 2. If it's a non-STANDBY interface and we're onlining it or
   1399 	 *    FAILBACK is enabled, then it should *not* end up INACTIVE.
   1400 	 *    Further, if the group is above capacity as a result of this
   1401 	 *    interface, then an active STANDBY interface in the group should
   1402 	 *    end up INACTIVE.
   1403 	 *
   1404 	 * 3. If it's a non-STANDBY interface, we're repairing it, and
   1405 	 *    FAILBACK is disabled, then it should end up INACTIVE *unless*
   1406 	 *    the group was failed (in which case we have no choice but to
   1407 	 *    use it).  No other interfaces should be changed.
   1408 	 */
   1409 	if (pi->pi_group != phyint_anongroup) {
   1410 		pi2 = pi->pi_group->pg_phyint;
   1411 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
   1412 			if (!(pi2->pi_flags & IFF_STANDBY))
   1413 				nnonstandby++;
   1414 
   1415 			if (phyint_is_functioning(pi2) &&
   1416 			    !(pi2->pi_flags & IFF_INACTIVE)) {
   1417 				nactive++;
   1418 				if (pi2->pi_flags & IFF_STANDBY)
   1419 					actstandbypi = pi2;
   1420 			}
   1421 		}
   1422 	}
   1423 
   1424 	set = 0;
   1425 	clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
   1426 
   1427 	if (pi->pi_flags & IFF_STANDBY) {			/* case 1 */
   1428 		if (nactive >= nnonstandby)
   1429 			set |= IFF_INACTIVE;
   1430 		else
   1431 			clear |= IFF_INACTIVE;
   1432 	} else if (onlining || failback_enabled) {		/* case 2 */
   1433 		if (nactive >= nnonstandby && actstandbypi != NULL)
   1434 			(void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
   1435 	} else if (!initial && !GROUP_FAILED(pi->pi_group)) {	/* case 3 */
   1436 		set |= IFF_INACTIVE;
   1437 	}
   1438 	(void) change_pif_flags(pi, set, clear);
   1439 
   1440 	phyint_chstate(pi, PI_RUNNING);
   1441 
   1442 	/*
   1443 	 * Update the group state to account for the change.
   1444 	 */
   1445 	phyint_group_refresh_state(pi->pi_group);
   1446 }
   1447 
   1448 /*
   1449  * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
   1450  * to have at least one active interface and as many active interfaces as
   1451  * non-standby interfaces.
   1452  */
   1453 void
   1454 phyint_standby_refresh_inactive(struct phyint *pi)
   1455 {
   1456 	struct phyint *pi2;
   1457 	uint_t nactive = 0, nnonstandby = 0;
   1458 
   1459 	/*
   1460 	 * All phyints in the anonymous group are effectively in their own
   1461 	 * group and thus active regardless of whether they're marked standby.
   1462 	 */
   1463 	if (pi->pi_group == phyint_anongroup) {
   1464 		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
   1465 		return;
   1466 	}
   1467 
   1468 	/*
   1469 	 * If the phyint isn't functioning we can't consider it.
   1470 	 */
   1471 	if (!phyint_is_functioning(pi))
   1472 		return;
   1473 
   1474 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
   1475 		if (!(pi2->pi_flags & IFF_STANDBY))
   1476 			nnonstandby++;
   1477 
   1478 		if (phyint_is_functioning(pi2) &&
   1479 		    !(pi2->pi_flags & IFF_INACTIVE))
   1480 			nactive++;
   1481 	}
   1482 
   1483 	if (nactive == 0 || nactive < nnonstandby)
   1484 		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
   1485 	else if (nactive > nnonstandby)
   1486 		(void) change_pif_flags(pi, IFF_INACTIVE, 0);
   1487 }
   1488 
   1489 /*
   1490  * See if a previously failed interface has started working again.
   1491  */
   1492 void
   1493 phyint_check_for_repair(struct phyint *pi)
   1494 {
   1495 	if (!phyint_repaired(pi))
   1496 		return;
   1497 
   1498 	if (pi->pi_group == phyint_anongroup) {
   1499 		logerr("IP interface repair detected on %s\n", pi->pi_name);
   1500 	} else {
   1501 		logerr("IP interface repair detected on %s of group %s\n",
   1502 		    pi->pi_name, pi->pi_group->pg_name);
   1503 	}
   1504 
   1505 	/*
   1506 	 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
   1507 	 * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
   1508 	 * until it is brought back online.
   1509 	 */
   1510 	if (pi->pi_state == PI_OFFLINE) {
   1511 		(void) change_pif_flags(pi, 0, IFF_FAILED);
   1512 		return;
   1513 	}
   1514 
   1515 	phyint_transition_to_running(pi);	/* calls phyint_chstate() */
   1516 }
   1517 
   1518 /*
   1519  * See if an interface has failed, or if the whole group of interfaces has
   1520  * failed.
   1521  */
   1522 static void
   1523 phyint_inst_check_for_failure(struct phyint_instance *pii)
   1524 {
   1525 	struct phyint	*pi = pii->pii_phyint;
   1526 	struct phyint	*pi2;
   1527 	boolean_t	was_active;
   1528 
   1529 	switch (failure_state(pii)) {
   1530 	case PHYINT_FAILURE:
   1531 		was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
   1532 
   1533 		(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
   1534 		if (pi->pi_group == phyint_anongroup) {
   1535 			logerr("IP interface failure detected on %s\n",
   1536 			    pii->pii_name);
   1537 		} else {
   1538 			logerr("IP interface failure detected on %s of group"
   1539 			    " %s\n", pii->pii_name, pi->pi_group->pg_name);
   1540 		}
   1541 
   1542 		/*
   1543 		 * If the failed interface was active, activate another
   1544 		 * INACTIVE interface in the group if possible.
   1545 		 */
   1546 		if (was_active)
   1547 			phyint_activate_another(pi);
   1548 
   1549 		/*
   1550 		 * If the interface is offline, the state change will be
   1551 		 * noted when it comes back online.
   1552 		 */
   1553 		if (pi->pi_state != PI_OFFLINE) {
   1554 			phyint_chstate(pi, PI_FAILED);
   1555 			reset_crtt_all(pi);
   1556 		}
   1557 		break;
   1558 
   1559 	case GROUP_FAILURE:
   1560 		pi2 = pi->pi_group->pg_phyint;
   1561 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
   1562 			(void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
   1563 			if (pi2->pi_state == PI_OFFLINE) /* see comment above */
   1564 				continue;
   1565 
   1566 			reset_crtt_all(pi2);
   1567 			/*
   1568 			 * In the case of host targets, we would have flushed
   1569 			 * the targets, and gone to PI_NOTARGETS state.
   1570 			 */
   1571 			if (pi2->pi_state == PI_RUNNING)
   1572 				phyint_chstate(pi2, PI_FAILED);
   1573 		}
   1574 		break;
   1575 
   1576 	default:
   1577 		break;
   1578 	}
   1579 }
   1580 
   1581 /*
   1582  * Determines if any timeout event has occurred and returns the number of
   1583  * milliseconds until the next timeout event for the phyint. Returns
   1584  * TIMER_INFINITY for "never".
   1585  */
   1586 uint_t
   1587 phyint_inst_timer(struct phyint_instance *pii)
   1588 {
   1589 	int 	pr_ndx;
   1590 	uint_t	timeout;
   1591 	struct	target	*cur_tg;
   1592 	struct	probe_stats *pr_statp;
   1593 	struct	phyint_instance *pii_other;
   1594 	struct	phyint *pi;
   1595 	int	valid_unack_count;
   1596 	int	i;
   1597 	int	interval;
   1598 	uint_t	check_time;
   1599 	uint_t	cur_time;
   1600 	hrtime_t cur_hrtime;
   1601 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
   1602 
   1603 	cur_hrtime = gethrtime();
   1604 	cur_time = ns2ms(cur_hrtime);
   1605 
   1606 	if (debug & D_TIMER) {
   1607 		logdebug("phyint_inst_timer(%s %s)\n",
   1608 		    AF_STR(pii->pii_af), pii->pii_name);
   1609 	}
   1610 
   1611 	pii_other = phyint_inst_other(pii);
   1612 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
   1613 		/*
   1614 		 * Check to see if we're here due to link up/down flapping; If
   1615 		 * enough time has passed, then try to bring the interface
   1616 		 * back up; otherwise, schedule a timer to bring it back up
   1617 		 * when enough time *has* elapsed.
   1618 		 */
   1619 		pi = pii->pii_phyint;
   1620 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
   1621 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
   1622 			if (check_time > cur_time)
   1623 				return (check_time - cur_time);
   1624 
   1625 			phyint_check_for_repair(pi);
   1626 		}
   1627 	}
   1628 
   1629 	/*
   1630 	 * If probing is not enabled on this phyint instance, don't proceed.
   1631 	 */
   1632 	if (!PROBE_ENABLED(pii))
   1633 		return (TIMER_INFINITY);
   1634 
   1635 	/*
   1636 	 * If the timer has fired too soon, probably triggered
   1637 	 * by some other phyint instance, return the remaining
   1638 	 * time
   1639 	 */
   1640 	if (TIME_LT(cur_time, pii->pii_snxt_time))
   1641 		return (pii->pii_snxt_time - cur_time);
   1642 
   1643 	/*
   1644 	 * If the link is down, don't send any probes for now.
   1645 	 */
   1646 	if (LINK_DOWN(pii->pii_phyint))
   1647 		return (TIMER_INFINITY);
   1648 
   1649 	/*
   1650 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
   1651 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
   1652 	 * Base probe time is strictly periodic.
   1653 	 */
   1654 	interval = GET_RANDOM(
   1655 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
   1656 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
   1657 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
   1658 
   1659 	/*
   1660 	 * Check if the current time > next time to probe. If so, we missed
   1661 	 * sending 1 or more probes, probably due to heavy system load. At least
   1662 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
   1663 	 * were scheduled. Make adjustments to the times, in multiples of
   1664 	 * user_probe_interval.
   1665 	 */
   1666 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
   1667 		int n;
   1668 
   1669 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
   1670 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
   1671 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
   1672 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
   1673 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
   1674 		    pii->pii_snxt_basetime);
   1675 
   1676 		/* Collect statistics about missed probes */
   1677 		probes_missed.pm_nprobes += n + 1;
   1678 		probes_missed.pm_ntimes++;
   1679 	}
   1680 	pii->pii_snxt_basetime += user_probe_interval;
   1681 	interval = pii->pii_snxt_time - cur_time;
   1682 	if (debug & D_TARGET) {
   1683 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
   1684 		    " interval %u\n", cur_time, pii->pii_snxt_time,
   1685 		    pii->pii_snxt_basetime, interval);
   1686 	}
   1687 
   1688 	/*
   1689 	 * If no targets are known, we need to send an ICMP multicast. The
   1690 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
   1691 	 * to see if we found a target.
   1692 	 */
   1693 	if (pii->pii_target_next == NULL) {
   1694 		assert(pii->pii_ntargets == 0);
   1695 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
   1696 		probe(pii, PROBE_MULTI, cur_time);
   1697 		return (interval);
   1698 	}
   1699 
   1700 	if ((user_probe_interval != probe_interval) &&
   1701 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
   1702 		/*
   1703 		 * the failure detection (fd) probe timer has not yet fired.
   1704 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
   1705 		 */
   1706 		probe(pii, PROBE_RTT, cur_hrtime);
   1707 		return (interval);
   1708 	}
   1709 	/*
   1710 	 * the fd probe timer has fired. Need to do all failure
   1711 	 * detection / recovery calculations, and then send an fd probe
   1712 	 * of type PROBE_UNI.
   1713 	 */
   1714 	if (user_probe_interval == probe_interval) {
   1715 		/*
   1716 		 * We could have missed some probes, and then adjusted
   1717 		 * pii_snxt_basetime above. Otherwise we could have
   1718 		 * blindly added probe_interval to pii_fd_snxt_basetime.
   1719 		 */
   1720 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
   1721 	} else {
   1722 		pii->pii_fd_snxt_basetime += probe_interval;
   1723 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
   1724 			int n;
   1725 
   1726 			n = (cur_time - pii->pii_fd_snxt_basetime) /
   1727 			    probe_interval;
   1728 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
   1729 		}
   1730 	}
   1731 
   1732 	/*
   1733 	 * We can have at most, the latest 2 probes that we sent, in
   1734 	 * the PR_UNACKED state. All previous probes sent, are either
   1735 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
   1736 	 * timed out if the probe's time_start + the CRTT < currenttime.
   1737 	 * For each of the last 2 probes, examine whether it has timed
   1738 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
   1739 	 */
   1740 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
   1741 	valid_unack_count = 0;
   1742 
   1743 	for (i = 0; i < 2; i++) {
   1744 		pr_statp = &pii->pii_probes[pr_ndx];
   1745 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
   1746 		switch (pr_statp->pr_status) {
   1747 		case PR_ACKED:
   1748 			/*
   1749 			 * We received back an ACK, so the switch clearly
   1750 			 * is not dropping our traffic, and thus we can
   1751 			 * enable failure detection immediately.
   1752 			 */
   1753 			if (pii->pii_fd_hrtime > gethrtime()) {
   1754 				if (debug & D_PROBE) {
   1755 					logdebug("successful probe on %s; "
   1756 					    "ending quiet period\n",
   1757 					    pii->pii_phyint->pi_name);
   1758 				}
   1759 				pii->pii_fd_hrtime = gethrtime();
   1760 			}
   1761 			break;
   1762 
   1763 		case PR_UNACKED:
   1764 			assert(cur_tg != NULL);
   1765 			/*
   1766 			 * The crtt could be zero for some reason,
   1767 			 * Eg. the phyint could be failed. If the crtt is
   1768 			 * not available use group's probe interval,
   1769 			 * which is a worst case estimate.
   1770 			 */
   1771 			timeout = ns2ms(pr_statp->pr_hrtime_start);
   1772 			if (cur_tg->tg_crtt != 0) {
   1773 				timeout += cur_tg->tg_crtt;
   1774 			} else {
   1775 				timeout += probe_interval;
   1776 			}
   1777 			if (TIME_LT(timeout, cur_time)) {
   1778 				pr_statp->pr_time_lost = timeout;
   1779 				probe_chstate(pr_statp, pii, PR_LOST);
   1780 			} else if (i == 1) {
   1781 				/*
   1782 				 * We are forced to consider this probe
   1783 				 * lost, as we can have at most 2 unack.
   1784 				 * probes any time, and we will be sending a
   1785 				 * probe at the end of this function.
   1786 				 * Normally, we should not be here, but
   1787 				 * this can happen if an incoming response
   1788 				 * that was considered lost has increased
   1789 				 * the crtt for this target, and also bumped
   1790 				 * up the FDT. Note that we never cancel or
   1791 				 * increase the current pii_time_left, so
   1792 				 * when the timer fires, we find 2 valid
   1793 				 * unacked probes, and they are yet to timeout
   1794 				 */
   1795 				pr_statp->pr_time_lost = cur_time;
   1796 				probe_chstate(pr_statp, pii, PR_LOST);
   1797 			} else {
   1798 				/*
   1799 				 * Only the most recent probe can enter
   1800 				 * this 'else' arm. The second most recent
   1801 				 * probe must take either of the above arms,
   1802 				 * if it is unacked.
   1803 				 */
   1804 				valid_unack_count++;
   1805 			}
   1806 			break;
   1807 		}
   1808 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
   1809 	}
   1810 
   1811 	/*
   1812 	 * We send out 1 probe randomly in the interval between one half
   1813 	 * and one probe interval for the group. Given that the CRTT is always
   1814 	 * less than the group's probe interval, we can have at most 1
   1815 	 * unacknowledged probe now.  All previous probes are either lost or
   1816 	 * acked.
   1817 	 */
   1818 	assert(valid_unack_count == 0 || valid_unack_count == 1);
   1819 
   1820 	/*
   1821 	 * The timer has fired. Take appropriate action depending
   1822 	 * on the current state of the phyint.
   1823 	 *
   1824 	 * PI_RUNNING state 	- Failure detection
   1825 	 * PI_FAILED state 	- Repair detection
   1826 	 */
   1827 	switch (pii->pii_phyint->pi_state) {
   1828 	case PI_FAILED:
   1829 		/*
   1830 		 * If the most recent probe (excluding unacked probes that
   1831 		 * are yet to time out) has been acked, check whether the
   1832 		 * phyint is now repaired.
   1833 		 */
   1834 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
   1835 			phyint_check_for_repair(pii->pii_phyint);
   1836 		}
   1837 		break;
   1838 
   1839 	case PI_RUNNING:
   1840 		/*
   1841 		 * It's possible our probes have been lost because of a
   1842 		 * spanning-tree mandated quiet period on the switch.  If so,
   1843 		 * ignore the lost probes.
   1844 		 */
   1845 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
   1846 			break;
   1847 
   1848 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
   1849 			/*
   1850 			 * We have 1 or more failed probes (excluding unacked
   1851 			 * probes that are yet to time out). Determine if the
   1852 			 * phyint has failed.
   1853 			 */
   1854 			phyint_inst_check_for_failure(pii);
   1855 		}
   1856 		break;
   1857 
   1858 	default:
   1859 		logerr("phyint_inst_timer: invalid state %d\n",
   1860 		    pii->pii_phyint->pi_state);
   1861 		abort();
   1862 	}
   1863 
   1864 	/*
   1865 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
   1866 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
   1867 	 * was called, the target list may be empty.
   1868 	 */
   1869 	if (pii->pii_target_next != NULL) {
   1870 		probe(pii, PROBE_UNI, cur_hrtime);
   1871 		/*
   1872 		 * If we have just the one probe target, and we're not using
   1873 		 * router targets, try to find another as we presently have
   1874 		 * no resilience.
   1875 		 */
   1876 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
   1877 			probe(pii, PROBE_MULTI, cur_hrtime);
   1878 	} else {
   1879 		probe(pii, PROBE_MULTI, cur_hrtime);
   1880 	}
   1881 	return (interval);
   1882 }
   1883 
   1884 /*
   1885  * Start the probe timer for an interface instance.
   1886  */
   1887 void
   1888 start_timer(struct phyint_instance *pii)
   1889 {
   1890 	uint32_t interval;
   1891 
   1892 	/*
   1893 	 * Spread the base probe times (pi_snxt_basetime) across phyints
   1894 	 * uniformly over the (curtime..curtime + the group's probe_interval).
   1895 	 * pi_snxt_basetime is strictly periodic with a frequency of
   1896 	 * the group's probe interval. The actual probe time pi_snxt_time
   1897 	 * adds some randomness to pi_snxt_basetime and happens in probe().
   1898 	 * For the 1st probe on each phyint after the timer is started,
   1899 	 * pi_snxt_time and pi_snxt_basetime are the same.
   1900 	 */
   1901 	interval = GET_RANDOM(0,
   1902 	    (int)pii->pii_phyint->pi_group->pg_probeint);
   1903 
   1904 	pii->pii_snxt_basetime = getcurrenttime() + interval;
   1905 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
   1906 	pii->pii_snxt_time = pii->pii_snxt_basetime;
   1907 	timer_schedule(interval);
   1908 }
   1909 
   1910 /*
   1911  * Restart the probe timer on an interface instance.
   1912  */
   1913 static void
   1914 restart_timer(struct phyint_instance *pii)
   1915 {
   1916 	/*
   1917 	 * We don't need to restart the timer if it was never started in
   1918 	 * the first place (pii->pii_basetime_inited not set), as the timer
   1919 	 * won't have gone off yet.
   1920 	 */
   1921 	if (pii->pii_basetime_inited != 0) {
   1922 
   1923 		if (debug & D_LINKNOTE)
   1924 			logdebug("restart timer: restarting timer on %s, "
   1925 			    "address family %s\n", pii->pii_phyint->pi_name,
   1926 			    AF_STR(pii->pii_af));
   1927 
   1928 		start_timer(pii);
   1929 	}
   1930 }
   1931 
   1932 static void
   1933 process_link_state_down(struct phyint *pi)
   1934 {
   1935 	logerr("The link has gone down on %s\n", pi->pi_name);
   1936 
   1937 	/*
   1938 	 * Clear the probe statistics arrays, we don't want the repair
   1939 	 * detection logic relying on probes that were successful prior
   1940 	 * to the link going down.
   1941 	 */
   1942 	if (PROBE_CAPABLE(pi->pi_v4))
   1943 		clear_pii_probe_stats(pi->pi_v4);
   1944 	if (PROBE_CAPABLE(pi->pi_v6))
   1945 		clear_pii_probe_stats(pi->pi_v6);
   1946 	/*
   1947 	 * Check for interface failure.  Although we know the interface
   1948 	 * has failed, we don't know if all the other interfaces in the
   1949 	 * group have failed as well.
   1950 	 */
   1951 	if ((pi->pi_state == PI_RUNNING) ||
   1952 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
   1953 		if (debug & D_LINKNOTE) {
   1954 			logdebug("process_link_state_down:"
   1955 			    " checking for failure on %s\n", pi->pi_name);
   1956 		}
   1957 
   1958 		if (pi->pi_v4 != NULL)
   1959 			phyint_inst_check_for_failure(pi->pi_v4);
   1960 		else if (pi->pi_v6 != NULL)
   1961 			phyint_inst_check_for_failure(pi->pi_v6);
   1962 	}
   1963 }
   1964 
   1965 static void
   1966 process_link_state_up(struct phyint *pi)
   1967 {
   1968 	logerr("The link has come up on %s\n", pi->pi_name);
   1969 
   1970 	/*
   1971 	 * We stopped any running timers on each instance when the link
   1972 	 * went down, so restart them.
   1973 	 */
   1974 	if (pi->pi_v4)
   1975 		restart_timer(pi->pi_v4);
   1976 	if (pi->pi_v6)
   1977 		restart_timer(pi->pi_v6);
   1978 
   1979 	phyint_check_for_repair(pi);
   1980 
   1981 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
   1982 	if (pi->pi_whendx == LINK_UP_PERMIN)
   1983 		pi->pi_whendx = 0;
   1984 }
   1985 
   1986 /*
   1987  * Process any changes in link state passed up from the interfaces.
   1988  */
   1989 void
   1990 process_link_state_changes(void)
   1991 {
   1992 	struct phyint *pi;
   1993 
   1994 	/* Look for interfaces where the link state has just changed */
   1995 
   1996 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
   1997 		boolean_t old_link_state_up = LINK_UP(pi);
   1998 
   1999 		/*
   2000 		 * Except when the "phyint" structure is created, this is
   2001 		 * the only place the link state is updated.  This allows
   2002 		 * this routine to detect changes in link state, rather
   2003 		 * than just the current state.
   2004 		 */
   2005 		UPDATE_LINK_STATE(pi);
   2006 
   2007 		if (LINK_DOWN(pi)) {
   2008 			/*
   2009 			 * Has link just gone down?
   2010 			 */
   2011 			if (old_link_state_up)
   2012 				process_link_state_down(pi);
   2013 		} else {
   2014 			/*
   2015 			 * Has link just gone back up?
   2016 			 */
   2017 			if (!old_link_state_up)
   2018 				process_link_state_up(pi);
   2019 		}
   2020 	}
   2021 }
   2022 
   2023 void
   2024 reset_crtt_all(struct phyint *pi)
   2025 {
   2026 	struct phyint_instance *pii;
   2027 	struct target *tg;
   2028 
   2029 	pii = pi->pi_v4;
   2030 	if (pii != NULL) {
   2031 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
   2032 			tg->tg_crtt = 0;
   2033 			tg->tg_rtt_sa = -1;
   2034 			tg->tg_rtt_sd = 0;
   2035 		}
   2036 	}
   2037 
   2038 	pii = pi->pi_v6;
   2039 	if (pii != NULL) {
   2040 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
   2041 			tg->tg_crtt = 0;
   2042 			tg->tg_rtt_sa = -1;
   2043 			tg->tg_rtt_sd = 0;
   2044 		}
   2045 	}
   2046 }
   2047 
   2048 /*
   2049  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
   2050  * probes on both instances IPv4 and IPv6.
   2051  * If the interface has failed, return the time of the first probe failure
   2052  * in "tff".
   2053  */
   2054 static int
   2055 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
   2056 {
   2057 	uint_t	pi_tff;
   2058 	struct	target *cur_tg;
   2059 	struct	probe_fail_count pfinfo;
   2060 	struct	phyint_instance *pii_other;
   2061 	int	pr_ndx;
   2062 
   2063 	/*
   2064 	 * Get the number of consecutive failed probes on
   2065 	 * this phyint across all targets. Also get the number
   2066 	 * of consecutive failed probes on this target only
   2067 	 */
   2068 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
   2069 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
   2070 	probe_fail_info(pii, cur_tg, &pfinfo);
   2071 
   2072 	/* Get the time of first failure, for later use */
   2073 	pi_tff = pfinfo.pf_tff;
   2074 
   2075 	/*
   2076 	 * If the current target has not responded to the
   2077 	 * last NUM_PROBE_FAILS probes, and other targets are
   2078 	 * responding delete this target. Dead gateway detection
   2079 	 * will eventually remove this target (if router) from the
   2080 	 * routing tables. If that does not occur, we may end
   2081 	 * up adding this to our list again.
   2082 	 */
   2083 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
   2084 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
   2085 		if (pii->pii_targets_are_routers) {
   2086 			if (cur_tg->tg_status == TG_ACTIVE)
   2087 				pii->pii_ntargets--;
   2088 			cur_tg->tg_status = TG_DEAD;
   2089 			cur_tg->tg_crtt = 0;
   2090 			cur_tg->tg_rtt_sa = -1;
   2091 			cur_tg->tg_rtt_sd = 0;
   2092 			if (pii->pii_target_next == cur_tg)
   2093 				pii->pii_target_next = target_next(cur_tg);
   2094 		} else {
   2095 			target_delete(cur_tg);
   2096 			probe(pii, PROBE_MULTI, gethrtime());
   2097 		}
   2098 		return (PHYINT_OK);
   2099 	}
   2100 
   2101 	/*
   2102 	 * If the phyint has lost NUM_PROBE_FAILS or more
   2103 	 * consecutive probes, on both IPv4 and IPv6 protocol
   2104 	 * instances of the phyint, then trigger failure
   2105 	 * detection, else return false
   2106 	 */
   2107 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
   2108 		return (PHYINT_OK);
   2109 
   2110 	pii_other = phyint_inst_other(pii);
   2111 	if (PROBE_CAPABLE(pii_other)) {
   2112 		probe_fail_info(pii_other, NULL, &pfinfo);
   2113 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
   2114 			/*
   2115 			 * We have NUM_PROBE_FAILS or more failures
   2116 			 * on both IPv4 and IPv6. Get the earliest
   2117 			 * time when failure was detected on this
   2118 			 * phyint across IPv4 and IPv6.
   2119 			 */
   2120 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
   2121 				pi_tff = pfinfo.pf_tff;
   2122 		} else {
   2123 			/*
   2124 			 * This instance has < NUM_PROBE_FAILS failure.
   2125 			 * So return false
   2126 			 */
   2127 			return (PHYINT_OK);
   2128 		}
   2129 	}
   2130 	*tff = pi_tff;
   2131 	return (PHYINT_FAILURE);
   2132 }
   2133 
   2134 /*
   2135  * Check if the link has gone down on this phyint, or it has failed the
   2136  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
   2137  * Also look at other phyints of this group, for group failures.
   2138  */
   2139 int
   2140 failure_state(struct phyint_instance *pii)
   2141 {
   2142 	struct	probe_success_count psinfo;
   2143 	uint_t	pi2_tls;		/* time last success */
   2144 	uint_t	pi_tff;			/* time first fail */
   2145 	struct	phyint *pi2;
   2146 	struct	phyint *pi;
   2147 	struct	phyint_instance *pii2;
   2148 	struct  phyint_group *pg;
   2149 	int	retval;
   2150 
   2151 	if (debug & D_FAILREP)
   2152 		logdebug("phyint_failed(%s)\n", pii->pii_name);
   2153 
   2154 	pi = pii->pii_phyint;
   2155 	pg = pi->pi_group;
   2156 
   2157 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
   2158 	    PHYINT_OK)
   2159 		return (PHYINT_OK);
   2160 
   2161 	/*
   2162 	 * At this point, the link is down, or the phyint is suspect, as it
   2163 	 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
   2164 	 * belong to any group, this is a PHYINT_FAILURE.  Otherwise, continue
   2165 	 * on to determine whether this should be considered a PHYINT_FAILURE
   2166 	 * or GROUP_FAILURE.
   2167 	 */
   2168 	if (pg == phyint_anongroup)
   2169 		return (PHYINT_FAILURE);
   2170 
   2171 	/*
   2172 	 * Need to compare against other phyints of the same group
   2173 	 * to exclude group failures. If the failure was detected via
   2174 	 * probing, then if the time of last success (tls) of any
   2175 	 * phyint is more recent than the time of first fail (tff) of the
   2176 	 * phyint in question, and the link is up on the phyint,
   2177 	 * then it is a phyint failure. Otherwise it is a group failure.
   2178 	 * If failure was detected via a link down notification sent from
   2179 	 * the driver to IP, we see if any phyints in the group are still
   2180 	 * running and haven't received a link down notification.  We
   2181 	 * will usually be processing the link down notification shortly
   2182 	 * after it was received, so there is no point looking at the tls
   2183 	 * of other phyints.
   2184 	 */
   2185 	retval = GROUP_FAILURE;
   2186 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
   2187 		/* Exclude ourself from comparison */
   2188 		if (pi2 == pi)
   2189 			continue;
   2190 
   2191 		if (LINK_DOWN(pi)) {
   2192 			/*
   2193 			 * We use FLAGS_TO_LINK_STATE() to test the flags
   2194 			 * directly, rather then LINK_UP() or LINK_DOWN(), as
   2195 			 * we may not have got round to processing the link
   2196 			 * state for the other phyints in the group yet.
   2197 			 *
   2198 			 * The check for PI_RUNNING and group failure handles
   2199 			 * the case when the group begins to recover.
   2200 			 * PI_RUNNING will be set, and group failure cleared
   2201 			 * only after receipt of NUM_PROBE_REPAIRS, by which
   2202 			 * time the other phyints should have received at
   2203 			 * least 1 packet, and so will not have NUM_PROBE_FAILS.
   2204 			 */
   2205 			if ((pi2->pi_state == PI_RUNNING) &&
   2206 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
   2207 				retval = PHYINT_FAILURE;
   2208 				break;
   2209 			}
   2210 			continue;
   2211 		}
   2212 
   2213 		if (LINK_DOWN(pi2))
   2214 			continue;
   2215 
   2216 		/*
   2217 		 * If there's no probe-based failure detection on this
   2218 		 * interface, and its link is still up, then it's still
   2219 		 * working and thus the group has not failed.
   2220 		 */
   2221 		if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
   2222 			retval = PHYINT_FAILURE;
   2223 			break;
   2224 		}
   2225 
   2226 		/*
   2227 		 * Need to compare against both IPv4 and IPv6 instances.
   2228 		 */
   2229 		pii2 = pi2->pi_v4;
   2230 		if (pii2 != NULL) {
   2231 			probe_success_info(pii2, NULL, &psinfo);
   2232 			if (psinfo.ps_tls_valid) {
   2233 				pi2_tls = psinfo.ps_tls;
   2234 				/*
   2235 				 * See comment above regarding check
   2236 				 * for PI_RUNNING and group failure.
   2237 				 */
   2238 				if (TIME_GT(pi2_tls, pi_tff) &&
   2239 				    (pi2->pi_state == PI_RUNNING) &&
   2240 				    !GROUP_FAILED(pg) &&
   2241 				    FLAGS_TO_LINK_STATE(pi2)) {
   2242 					retval = PHYINT_FAILURE;
   2243 					break;
   2244 				}
   2245 			}
   2246 		}
   2247 
   2248 		pii2 = pi2->pi_v6;
   2249 		if (pii2 != NULL) {
   2250 			probe_success_info(pii2, NULL, &psinfo);
   2251 			if (psinfo.ps_tls_valid) {
   2252 				pi2_tls = psinfo.ps_tls;
   2253 				/*
   2254 				 * See comment above regarding check
   2255 				 * for PI_RUNNING and group failure.
   2256 				 */
   2257 				if (TIME_GT(pi2_tls, pi_tff) &&
   2258 				    (pi2->pi_state == PI_RUNNING) &&
   2259 				    !GROUP_FAILED(pg) &&
   2260 				    FLAGS_TO_LINK_STATE(pi2)) {
   2261 					retval = PHYINT_FAILURE;
   2262 					break;
   2263 				}
   2264 			}
   2265 		}
   2266 	}
   2267 
   2268 	/*
   2269 	 * Update the group state to account for the changes.
   2270 	 */
   2271 	phyint_group_refresh_state(pg);
   2272 	return (retval);
   2273 }
   2274 
   2275 /*
   2276  * Return the information associated with consecutive probe successes
   2277  * starting with the most recent probe. At most the last 2 probes can be
   2278  * in the unacknowledged state. All previous probes have either failed
   2279  * or succeeded.
   2280  */
   2281 static void
   2282 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
   2283     struct probe_success_count *psinfo)
   2284 {
   2285 	uint_t	i;
   2286 	struct probe_stats *pr_statp;
   2287 	uint_t most_recent;
   2288 	uint_t second_most_recent;
   2289 	boolean_t pi_found_failure = _B_FALSE;
   2290 	boolean_t tg_found_failure = _B_FALSE;
   2291 	uint_t now;
   2292 	uint_t timeout;
   2293 	struct target *tg;
   2294 
   2295 	if (debug & D_FAILREP)
   2296 		logdebug("probe_success_info(%s)\n", pii->pii_name);
   2297 
   2298 	bzero(psinfo, sizeof (*psinfo));
   2299 	now = getcurrenttime();
   2300 
   2301 	/*
   2302 	 * Start with the most recent probe, and count the number
   2303 	 * of consecutive probe successes. Latch the number of successes
   2304 	 * on hitting a failure.
   2305 	 */
   2306 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
   2307 	second_most_recent = PROBE_INDEX_PREV(most_recent);
   2308 
   2309 	for (i = most_recent; i != pii->pii_probe_next;
   2310 	    i = PROBE_INDEX_PREV(i)) {
   2311 		pr_statp = &pii->pii_probes[i];
   2312 
   2313 		switch (pr_statp->pr_status) {
   2314 		case PR_UNACKED:
   2315 			/*
   2316 			 * Only the most recent 2 probes can be unacknowledged
   2317 			 */
   2318 			assert(i == most_recent || i == second_most_recent);
   2319 
   2320 			tg = pr_statp->pr_target;
   2321 			assert(tg != NULL);
   2322 			/*
   2323 			 * The crtt could be zero for some reason,
   2324 			 * Eg. the phyint could be failed. If the crtt is
   2325 			 * not available use the value of the group's probe
   2326 			 * interval which is a worst case estimate.
   2327 			 */
   2328 			timeout = ns2ms(pr_statp->pr_hrtime_start);
   2329 			if (tg->tg_crtt != 0) {
   2330 				timeout += tg->tg_crtt;
   2331 			} else {
   2332 				timeout +=
   2333 				    pii->pii_phyint->pi_group->pg_probeint;
   2334 			}
   2335 
   2336 			if (TIME_LT(timeout, now)) {
   2337 				/*
   2338 				 * We hit a failure. Latch the total number of
   2339 				 * recent consecutive successes.
   2340 				 */
   2341 				pr_statp->pr_time_lost = timeout;
   2342 				probe_chstate(pr_statp, pii, PR_LOST);
   2343 				pi_found_failure = _B_TRUE;
   2344 				if (cur_tg != NULL && tg == cur_tg) {
   2345 					/*
   2346 					 * We hit a failure for the desired
   2347 					 * target. Latch the number of recent
   2348 					 * consecutive successes for this target
   2349 					 */
   2350 					tg_found_failure = _B_TRUE;
   2351 				}
   2352 			}
   2353 			break;
   2354 
   2355 		case PR_ACKED:
   2356 			/*
   2357 			 * Bump up the count of probe successes, if we
   2358 			 * have not seen any failure so far.
   2359 			 */
   2360 			if (!pi_found_failure)
   2361 				psinfo->ps_nsucc++;
   2362 
   2363 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
   2364 			    !tg_found_failure) {
   2365 				psinfo->ps_nsucc_tg++;
   2366 			}
   2367 
   2368 			/*
   2369 			 * Record the time of last success, if this is
   2370 			 * the most recent probe success.
   2371 			 */
   2372 			if (!psinfo->ps_tls_valid) {
   2373 				psinfo->ps_tls =
   2374 				    ns2ms(pr_statp->pr_hrtime_ackproc);
   2375 				psinfo->ps_tls_valid = _B_TRUE;
   2376 			}
   2377 			break;
   2378 
   2379 		case PR_LOST:
   2380 			/*
   2381 			 * We hit a failure. Latch the total number of
   2382 			 * recent consecutive successes.
   2383 			 */
   2384 			pi_found_failure = _B_TRUE;
   2385 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
   2386 				/*
   2387 				 * We hit a failure for the desired target.
   2388 				 * Latch the number of recent consecutive
   2389 				 * successes for this target
   2390 				 */
   2391 				tg_found_failure = _B_TRUE;
   2392 			}
   2393 			break;
   2394 
   2395 		default:
   2396 			return;
   2397 
   2398 		}
   2399 	}
   2400 }
   2401 
   2402 /*
   2403  * Return the information associated with consecutive probe failures
   2404  * starting with the most recent probe. Only the last 2 probes can be in the
   2405  * unacknowledged state. All previous probes have either failed or succeeded.
   2406  */
   2407 static void
   2408 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
   2409     struct probe_fail_count *pfinfo)
   2410 {
   2411 	int	i;
   2412 	struct probe_stats *pr_statp;
   2413 	boolean_t	tg_found_success = _B_FALSE;
   2414 	boolean_t	pi_found_success = _B_FALSE;
   2415 	int	most_recent;
   2416 	int	second_most_recent;
   2417 	uint_t	now;
   2418 	uint_t	timeout;
   2419 	struct	target *tg;
   2420 
   2421 	if (debug & D_FAILREP)
   2422 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
   2423 
   2424 	bzero(pfinfo, sizeof (*pfinfo));
   2425 	now = getcurrenttime();
   2426 
   2427 	/*
   2428 	 * Start with the most recent probe, and count the number
   2429 	 * of consecutive probe failures. Latch the number of failures
   2430 	 * on hitting a probe success.
   2431 	 */
   2432 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
   2433 	second_most_recent = PROBE_INDEX_PREV(most_recent);
   2434 
   2435 	for (i = most_recent; i != pii->pii_probe_next;
   2436 	    i = PROBE_INDEX_PREV(i)) {
   2437 		pr_statp = &pii->pii_probes[i];
   2438 
   2439 		assert(PR_STATUS_VALID(pr_statp->pr_status));
   2440 
   2441 		switch (pr_statp->pr_status) {
   2442 		case PR_UNACKED:
   2443 			/*
   2444 			 * Only the most recent 2 probes can be unacknowledged
   2445 			 */
   2446 			assert(i == most_recent || i == second_most_recent);
   2447 
   2448 			tg = pr_statp->pr_target;
   2449 			/*
   2450 			 * Target is guaranteed to exist in the unack. state
   2451 			 */
   2452 			assert(tg != NULL);
   2453 			/*
   2454 			 * The crtt could be zero for some reason,
   2455 			 * Eg. the phyint could be failed. If the crtt is
   2456 			 * not available use the group's probe interval,
   2457 			 * which is a worst case estimate.
   2458 			 */
   2459 			timeout = ns2ms(pr_statp->pr_hrtime_start);
   2460 			if (tg->tg_crtt != 0) {
   2461 				timeout += tg->tg_crtt;
   2462 			} else {
   2463 				timeout +=
   2464 				    pii->pii_phyint->pi_group->pg_probeint;
   2465 			}
   2466 
   2467 			if (TIME_GT(timeout, now))
   2468 				break;
   2469 
   2470 			pr_statp->pr_time_lost = timeout;
   2471 			probe_chstate(pr_statp, pii, PR_LOST);
   2472 			/* FALLTHRU */
   2473 
   2474 		case PR_LOST:
   2475 			if (!pi_found_success) {
   2476 				pfinfo->pf_nfail++;
   2477 				pfinfo->pf_tff = pr_statp->pr_time_lost;
   2478 			}
   2479 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
   2480 			    !tg_found_success)  {
   2481 				pfinfo->pf_nfail_tg++;
   2482 			}
   2483 			break;
   2484 
   2485 		default:
   2486 			/*
   2487 			 * We hit a success or unused slot. Latch the
   2488 			 * total number of recent consecutive failures.
   2489 			 */
   2490 			pi_found_success = _B_TRUE;
   2491 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
   2492 				/*
   2493 				 * We hit a success for the desired target.
   2494 				 * Latch the number of recent consecutive
   2495 				 * failures for this target
   2496 				 */
   2497 				tg_found_success = _B_TRUE;
   2498 			}
   2499 		}
   2500 	}
   2501 }
   2502 
   2503 /*
   2504  * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
   2505  */
   2506 void
   2507 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
   2508 {
   2509 	if (pr->pr_status == state)
   2510 		return;
   2511 
   2512 	pr->pr_status = state;
   2513 	(void) probe_state_event(pr, pii);
   2514 }
   2515 
   2516 /*
   2517  * Check if the phyint has been repaired.  If no test address has been
   2518  * configured, then consider the interface repaired if the link is up (unless
   2519  * the link is flapping; see below).  Otherwise, look for proof of probes
   2520  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
   2521  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
   2522  */
   2523 static boolean_t
   2524 phyint_repaired(struct phyint *pi)
   2525 {
   2526 	struct	probe_success_count psinfo;
   2527 	struct	phyint_instance *pii;
   2528 	struct	target *cur_tg;
   2529 	int	pr_ndx;
   2530 	uint_t	cur_time;
   2531 
   2532 	if (debug & D_FAILREP)
   2533 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
   2534 
   2535 	if (LINK_DOWN(pi))
   2536 		return (_B_FALSE);
   2537 
   2538 	/*
   2539 	 * If we don't have any test addresses and the link is up, then
   2540 	 * consider the interface repaired, unless we've received more than
   2541 	 * LINK_UP_PERMIN link up notifications in the last minute, in
   2542 	 * which case we keep the link down until we drop back below
   2543 	 * the threshold.
   2544 	 */
   2545 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
   2546 		cur_time = getcurrenttime();
   2547 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
   2548 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
   2549 			pi->pi_lfmsg_printed = 0;
   2550 			return (_B_TRUE);
   2551 		}
   2552 		if (!pi->pi_lfmsg_printed) {
   2553 			logerr("The link has come up on %s more than %d times "
   2554 			    "in the last minute; disabling repair until it "
   2555 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
   2556 			pi->pi_lfmsg_printed = 1;
   2557 		}
   2558 
   2559 		return (_B_FALSE);
   2560 	}
   2561 
   2562 	pii = pi->pi_v4;
   2563 	if (PROBE_CAPABLE(pii)) {
   2564 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
   2565 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
   2566 		probe_success_info(pii, cur_tg, &psinfo);
   2567 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
   2568 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
   2569 			return (_B_TRUE);
   2570 	}
   2571 
   2572 	pii = pi->pi_v6;
   2573 	if (PROBE_CAPABLE(pii)) {
   2574 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
   2575 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
   2576 		probe_success_info(pii, cur_tg, &psinfo);
   2577 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
   2578 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
   2579 			return (_B_TRUE);
   2580 	}
   2581 
   2582 	return (_B_FALSE);
   2583 }
   2584 
   2585 /*
   2586  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
   2587  */
   2588 boolean_t
   2589 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
   2590 {
   2591 	int ifsock;
   2592 	struct lifreq lifr;
   2593 	uint64_t old_flags;
   2594 
   2595 	if (debug & D_FAILREP) {
   2596 		logdebug("change_pif_flags(%s): set %llx clear %llx\n",
   2597 		    pi->pi_name, set, clear);
   2598 	}
   2599 
   2600 	if (pi->pi_v4 != NULL)
   2601 		ifsock = ifsock_v4;
   2602 	else
   2603 		ifsock = ifsock_v6;
   2604 
   2605 	/*
   2606 	 * Get the current flags from the kernel, and set/clear the
   2607 	 * desired phyint flags. Since we set only phyint flags, we can
   2608 	 * do it on either IPv4 or IPv6 instance.
   2609 	 */
   2610 	(void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
   2611 
   2612 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
   2613 		if (errno != ENXIO)
   2614 			logperror("change_pif_flags: ioctl (get flags)");
   2615 		return (_B_FALSE);
   2616 	}
   2617 
   2618 	old_flags = lifr.lifr_flags;
   2619 	lifr.lifr_flags |= set;
   2620 	lifr.lifr_flags &= ~clear;
   2621 
   2622 	if (old_flags == lifr.lifr_flags) {
   2623 		/* No change in the flags. No need to send ioctl */
   2624 		return (_B_TRUE);
   2625 	}
   2626 
   2627 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
   2628 		if (errno != ENXIO)
   2629 			logperror("change_pif_flags: ioctl (set flags)");
   2630 		return (_B_FALSE);
   2631 	}
   2632 
   2633 	/*
   2634 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
   2635 	 * phyint flags.
   2636 	 */
   2637 	pi->pi_flags |= set;
   2638 	pi->pi_flags &= ~clear;
   2639 
   2640 	if (pi->pi_v4 != NULL)
   2641 		pi->pi_v4->pii_flags = pi->pi_flags;
   2642 
   2643 	if (pi->pi_v6 != NULL)
   2644 		pi->pi_v6->pii_flags = pi->pi_flags;
   2645 
   2646 	return (_B_TRUE);
   2647 }
   2648 
   2649 /*
   2650  * icmp cksum computation for IPv4.
   2651  */
   2652 static int
   2653 in_cksum(ushort_t *addr, int len)
   2654 {
   2655 	register int nleft = len;
   2656 	register ushort_t *w = addr;
   2657 	register ushort_t answer;
   2658 	ushort_t odd_byte = 0;
   2659 	register int sum = 0;
   2660 
   2661 	/*
   2662 	 *  Our algorithm is simple, using a 32 bit accumulator (sum),
   2663 	 *  we add sequential 16 bit words to it, and at the end, fold
   2664 	 *  back all the carry bits from the top 16 bits into the lower
   2665 	 *  16 bits.
   2666 	 */
   2667 	while (nleft > 1)  {
   2668 		sum += *w++;
   2669 		nleft -= 2;
   2670 	}
   2671 
   2672 	/* mop up an odd byte, if necessary */
   2673 	if (nleft == 1) {
   2674 		*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
   2675 		sum += odd_byte;
   2676 	}
   2677 
   2678 	/*
   2679 	 * add back carry outs from top 16 bits to low 16 bits
   2680 	 */
   2681 	sum = (sum >> 16) + (sum & 0xffff);	/* add hi 16 to low 16 */
   2682 	sum += (sum >> 16);			/* add carry */
   2683 	answer = ~sum;				/* truncate to 16 bits */
   2684 	return (answer);
   2685 }
   2686 
   2687 static void
   2688 reset_snxt_basetimes(void)
   2689 {
   2690 	struct phyint_instance *pii;
   2691 
   2692 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
   2693 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
   2694 	}
   2695 }
   2696 
   2697 /*
   2698  * Is the address one of our own addresses? Unfortunately,
   2699  * we cannot check our phyint tables to determine if the address
   2700  * is our own. This is because, we don't track interfaces that
   2701  * are not part of any group. We have to either use a 'bind' or
   2702  * get the complete list of all interfaces using SIOCGLIFCONF,
   2703  * to do this check. We could also use SIOCTMYADDR.
   2704  * Bind fails for the local zone address, so we might include local zone
   2705  * address as target address. If local zone address is a target address
   2706  * and it is up, it is not possible to detect the interface failure.
   2707  * SIOCTMYADDR also doesn't consider local zone address as own address.
   2708  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
   2709  * are stored in `localaddrs'
   2710  */
   2711 boolean_t
   2712 own_address(struct in6_addr addr)
   2713 {
   2714 	addrlist_t *addrp;
   2715 	struct sockaddr_storage ss;
   2716 	int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
   2717 
   2718 	addr2storage(af, &addr, &ss);
   2719 	for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
   2720 		if (sockaddrcmp(&ss, &addrp->al_addr))
   2721 			return (_B_TRUE);
   2722 	}
   2723 	return (_B_FALSE);
   2724 }
   2725 
   2726 static int
   2727 ns2ms(int64_t ns)
   2728 {
   2729 	return (ns / (NANOSEC / MILLISEC));
   2730 }
   2731 
   2732 static int64_t
   2733 tv2ns(struct timeval *tvp)
   2734 {
   2735 	return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000);
   2736 }
   2737