Home | History | Annotate | Download | only in in.mpathd
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include "mpd_defs.h"
     27 #include "mpd_tables.h"
     28 
/*
 * Global list of phyints, phyint instances, phyint groups and the anonymous
 * group; the latter is initialized in phyint_init().
 *
 * Each list is doubly linked and new entries are pushed onto its head (see
 * phyint_insert(), phyint_inst_insert() and phyint_group_insert()).  Note
 * that phyint_init() only creates the anonymous group when
 * `track_all_phyints' is set.
 */
struct phyint *phyints = NULL;
struct phyint_instance	*phyint_instances = NULL;
struct phyint_group *phyint_groups = NULL;
struct phyint_group *phyint_anongroup;

/*
 * Grouplist signature; initialized in phyint_init().  It is seeded from
 * gensig() and bumped when the set of groups changes (e.g., in
 * phyint_group_insert()).
 */
static uint64_t phyint_grouplistsig;
     42 
/* Forward declarations for the file-local helpers, grouped by object. */

/* phyint instance helpers */
static void phyint_inst_insert(struct phyint_instance *pii);
static void phyint_inst_print(struct phyint_instance *pii);

/* phyint helpers */
static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
static void phyint_delete(struct phyint *pi);
static boolean_t phyint_is_usable(struct phyint *pi);

/* logical interface helpers */
static void logint_print(struct logint *li);
static void logint_insert(struct phyint_instance *pii, struct logint *li);
static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);

/* probe target helpers */
static void target_print(struct target *tg);
static void target_insert(struct phyint_instance *pii, struct target *tg);
static struct target *target_first(struct phyint_instance *pii);
static struct target *target_select_best(struct phyint_instance *pii);
static void target_flush_hosts(struct phyint_group *pg);

static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);

/* per-address-family probe socket setup, used by phyint_inst_sockinit() */
static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);

/* event generators, called after the corresponding signature is bumped */
static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
static int phyint_group_state_event(struct phyint_group *pg);
static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
    ipmp_if_op_t op);

static int logint_upcount(struct phyint *pi);
static uint64_t gensig(void);
     73 
     74 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
     75 int
     76 phyint_init(void)
     77 {
     78 	phyint_grouplistsig = gensig();
     79 	if (track_all_phyints) {
     80 		phyint_anongroup = phyint_group_create("");
     81 		if (phyint_anongroup == NULL)
     82 			return (-1);
     83 		phyint_group_insert(phyint_anongroup);
     84 	}
     85 	return (0);
     86 }
     87 
     88 /* Return the phyint with the given name */
     89 struct phyint *
     90 phyint_lookup(const char *name)
     91 {
     92 	struct phyint *pi;
     93 
     94 	if (debug & D_PHYINT)
     95 		logdebug("phyint_lookup(%s)\n", name);
     96 
     97 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
     98 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
     99 			break;
    100 	}
    101 	return (pi);
    102 }
    103 
    104 /*
    105  * Lookup a phyint in the group that has the same hardware address as `pi', or
    106  * NULL if there's none.  If `online_only' is set, then only online phyints
    107  * are considered when matching.  Otherwise, phyints that had been offlined
    108  * due to a duplicate hardware address will also be considered.
    109  */
    110 static struct phyint *
    111 phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
    112 {
    113 	struct phyint *pi2;
    114 
    115 	if (pi->pi_group == phyint_anongroup)
    116 		return (NULL);
    117 
    118 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
    119 		if (pi2 == pi)
    120 			continue;
    121 
    122 		/*
    123 		 * NOTE: even when online_only is B_FALSE, we ignore phyints
    124 		 * that are administratively offline (rather than offline
    125 		 * because they're dups); when they're brought back online,
    126 		 * they'll be flagged as dups if need be.
    127 		 */
    128 		if (pi2->pi_state == PI_OFFLINE &&
    129 		    (online_only || !pi2->pi_hwaddrdup))
    130 			continue;
    131 
    132 		if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen &&
    133 		    bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0)
    134 			return (pi2);
    135 	}
    136 	return (NULL);
    137 }
    138 
/*
 * Respond to DLPI notifications.  Currently, this only processes physical
 * address changes for the phyint passed via `arg' by onlining or offlining
 * phyints in the group.
 *
 * `dh' is not used (hence ARGSUSED), `dnip' describes the notification, and
 * `arg' is the phyint that phyint_link_init() handed to dlpi_enabnotify().
 */
/* ARGSUSED */
static void
phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
{
	struct phyint *pi = arg;
	struct phyint *oduppi = NULL, *duppi = NULL;

	/* We should only be called for notes recorded in pi_notes */
	assert((dnip->dni_note & pi->pi_notes) != 0);

	if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
		return;

	/* The new address is copied into pi_hwaddr below */
	assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);

	/*
	 * If our hardware address hasn't changed, there's nothing to do.
	 */
	if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
	    bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
		return;

	/*
	 * The order here matters: phyint_lookup_hwaddr() matches against
	 * pi->pi_hwaddr, so `oduppi' (a phyint sharing our old address) must
	 * be found before the new address is stored, and `duppi' (a phyint
	 * sharing our new address) after.
	 */
	oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
	pi->pi_hwaddrlen = dnip->dni_physaddrlen;
	(void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
	duppi = phyint_lookup_hwaddr(pi, _B_FALSE);

	if (oduppi != NULL || pi->pi_hwaddrdup) {
		/*
		 * Our old hardware address was a duplicate.  If we'd been
		 * offlined because of it, and our new hardware address is not
		 * a duplicate, then bring us online.  Otherwise, `oduppi'
		 * must've been the one brought offline; bring it online.
		 */
		if (pi->pi_hwaddrdup) {
			if (duppi == NULL)
				(void) phyint_undo_offline(pi);
		} else {
			assert(oduppi->pi_hwaddrdup);
			(void) phyint_undo_offline(oduppi);
		}
	}

	if (duppi != NULL && !pi->pi_hwaddrdup) {
		/*
		 * Our new hardware address was a duplicate and we're not
		 * yet flagged as a duplicate; bring us offline.
		 */
		pi->pi_hwaddrdup = _B_TRUE;
		(void) phyint_offline(pi, 0);
	}
}
    195 
    196 /*
    197  * Initialize information about the underlying link for `pi', and set us
    198  * up to be notified about future changes.  Returns _B_TRUE on success.
    199  */
    200 boolean_t
    201 phyint_link_init(struct phyint *pi)
    202 {
    203 	int retval;
    204 	uint_t notes;
    205 	const char *errmsg;
    206 	dlpi_notifyid_t id;
    207 
    208 	pi->pi_notes = 0;
    209 	retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
    210 	if (retval != DLPI_SUCCESS) {
    211 		pi->pi_dh = NULL;
    212 		errmsg = "cannot open";
    213 		goto failed;
    214 	}
    215 
    216 	pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
    217 	retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
    218 	    &pi->pi_hwaddrlen);
    219 	if (retval != DLPI_SUCCESS) {
    220 		errmsg = "cannot get hardware address";
    221 		goto failed;
    222 	}
    223 
    224 	/*
    225 	 * Check if the link supports DLPI link state notifications.  For
    226 	 * historical reasons, the actual changes are tracked through routing
    227 	 * sockets, so we immediately disable the notification upon success.
    228 	 */
    229 	notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
    230 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
    231 	if (retval == DLPI_SUCCESS) {
    232 		(void) dlpi_disabnotify(pi->pi_dh, id, NULL);
    233 		pi->pi_notes |= notes;
    234 	}
    235 
    236 	/*
    237 	 * Enable notification of hardware address changes to keep pi_hwaddr
    238 	 * up-to-date and track if we need to offline/undo-offline phyints.
    239 	 */
    240 	notes = DL_NOTE_PHYS_ADDR;
    241 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
    242 	if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
    243 		pi->pi_notes |= notes;
    244 
    245 	return (_B_TRUE);
    246 failed:
    247 	logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
    248 	if (pi->pi_dh != NULL) {
    249 		dlpi_close(pi->pi_dh);
    250 		pi->pi_dh = NULL;
    251 	}
    252 	return (_B_FALSE);
    253 }
    254 
    255 /*
    256  * Close use of link on `pi'.
    257  */
    258 void
    259 phyint_link_close(struct phyint *pi)
    260 {
    261 	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
    262 		(void) poll_remove(dlpi_fd(pi->pi_dh));
    263 		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
    264 	}
    265 
    266 	/*
    267 	 * NOTE: we don't clear pi_notes here so that iflinkstate() can still
    268 	 * properly report the link state even when offline (which is possible
    269 	 * since we use IFF_RUNNING to track link state).
    270 	 */
    271 	dlpi_close(pi->pi_dh);
    272 	pi->pi_dh = NULL;
    273 }
    274 
    275 /* Return the phyint instance with the given name and the given family */
    276 struct phyint_instance *
    277 phyint_inst_lookup(int af, char *name)
    278 {
    279 	struct phyint *pi;
    280 
    281 	if (debug & D_PHYINT)
    282 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
    283 
    284 	assert(af == AF_INET || af == AF_INET6);
    285 
    286 	pi = phyint_lookup(name);
    287 	if (pi == NULL)
    288 		return (NULL);
    289 
    290 	return (PHYINT_INSTANCE(pi, af));
    291 }
    292 
    293 struct phyint_group *
    294 phyint_group_lookup(const char *pg_name)
    295 {
    296 	struct phyint_group *pg;
    297 
    298 	if (debug & D_PHYINT)
    299 		logdebug("phyint_group_lookup(%s)\n", pg_name);
    300 
    301 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
    302 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
    303 			break;
    304 	}
    305 	return (pg);
    306 }
    307 
    308 /*
    309  * Insert the phyint in the linked list of all phyints. If the phyint belongs
    310  * to some group, insert it in the phyint group list.
    311  */
    312 static void
    313 phyint_insert(struct phyint *pi, struct phyint_group *pg)
    314 {
    315 	if (debug & D_PHYINT)
    316 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
    317 
    318 	/* Insert the phyint at the head of the 'all phyints' list */
    319 	pi->pi_next = phyints;
    320 	pi->pi_prev = NULL;
    321 	if (phyints != NULL)
    322 		phyints->pi_prev = pi;
    323 	phyints = pi;
    324 
    325 	/*
    326 	 * Insert the phyint at the head of the 'phyint_group members' list
    327 	 * of the phyint group to which it belongs.
    328 	 */
    329 	pi->pi_pgnext = NULL;
    330 	pi->pi_pgprev = NULL;
    331 	pi->pi_group = pg;
    332 
    333 	pi->pi_pgnext = pg->pg_phyint;
    334 	if (pi->pi_pgnext != NULL)
    335 		pi->pi_pgnext->pi_pgprev = pi;
    336 	pg->pg_phyint = pi;
    337 
    338 	/* Refresh the group state now that this phyint has been added */
    339 	phyint_group_refresh_state(pg);
    340 
    341 	pg->pg_sig++;
    342 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
    343 }
    344 
    345 /* Insert the phyint instance in the linked list of all phyint instances. */
    346 static void
    347 phyint_inst_insert(struct phyint_instance *pii)
    348 {
    349 	if (debug & D_PHYINT) {
    350 		logdebug("phyint_inst_insert(%s %s)\n",
    351 		    AF_STR(pii->pii_af), pii->pii_name);
    352 	}
    353 
    354 	/*
    355 	 * Insert the phyint at the head of the 'all phyint instances' list.
    356 	 */
    357 	pii->pii_next = phyint_instances;
    358 	pii->pii_prev = NULL;
    359 	if (phyint_instances != NULL)
    360 		phyint_instances->pii_prev = pii;
    361 	phyint_instances = pii;
    362 }
    363 
    364 /*
    365  * Create a new phyint with the given parameters. Also insert it into
    366  * the list of all phyints and the list of phyint group members by calling
    367  * phyint_insert().
    368  */
    369 static struct phyint *
    370 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
    371     uint64_t flags)
    372 {
    373 	struct phyint *pi;
    374 
    375 	pi = calloc(1, sizeof (struct phyint));
    376 	if (pi == NULL) {
    377 		logperror("phyint_create: calloc");
    378 		return (NULL);
    379 	}
    380 
    381 	/*
    382 	 * Record the phyint values.
    383 	 */
    384 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
    385 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
    386 	pi->pi_ifindex = ifindex;
    387 	pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
    388 
    389 	pi->pi_state = PI_INIT;
    390 	pi->pi_flags = PHYINT_FLAGS(flags);
    391 
    392 	/*
    393 	 * Initialize the link state.  The link state is initialized to
    394 	 * up, so that if the link is down when IPMP starts monitoring
    395 	 * the interface, it will appear as though there has been a
    396 	 * transition from the link up to link down.  This avoids
    397 	 * having to treat this situation as a special case.
    398 	 */
    399 	INIT_LINK_STATE(pi);
    400 
    401 	if (!phyint_link_init(pi)) {
    402 		free(pi);
    403 		return (NULL);
    404 	}
    405 
    406 	/*
    407 	 * Insert the phyint in the list of all phyints, and the
    408 	 * list of phyint group members
    409 	 */
    410 	phyint_insert(pi, pg);
    411 
    412 	return (pi);
    413 }
    414 
    415 /*
    416  * Create a new phyint instance belonging to the phyint 'pi' and address
    417  * family 'af'. Also insert it into the list of all phyint instances by
    418  * calling phyint_inst_insert().
    419  */
    420 static struct phyint_instance *
    421 phyint_inst_create(struct phyint *pi, int af)
    422 {
    423 	struct phyint_instance *pii;
    424 
    425 	pii = calloc(1, sizeof (struct phyint_instance));
    426 	if (pii == NULL) {
    427 		logperror("phyint_inst_create: calloc");
    428 		return (NULL);
    429 	}
    430 
    431 	/*
    432 	 * Attach the phyint instance to the phyint.
    433 	 * Set the back pointers as well
    434 	 */
    435 	pii->pii_phyint = pi;
    436 	if (af == AF_INET)
    437 		pi->pi_v4 = pii;
    438 	else
    439 		pi->pi_v6 = pii;
    440 
    441 	pii->pii_in_use = 1;
    442 	pii->pii_probe_sock = -1;
    443 	pii->pii_snxt = 1;
    444 	pii->pii_af = af;
    445 	pii->pii_fd_hrtime = gethrtime() +
    446 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
    447 	pii->pii_flags = pi->pi_flags;
    448 
    449 	/* Insert the phyint instance in the list of all phyint instances. */
    450 	phyint_inst_insert(pii);
    451 	return (pii);
    452 }
    453 
    454 /*
    455  * Change the state of phyint `pi' to state `state'.
    456  */
    457 void
    458 phyint_chstate(struct phyint *pi, enum pi_state state)
    459 {
    460 	/*
    461 	 * To simplify things, some callers always set a given state
    462 	 * regardless of the previous state of the phyint (e.g., setting
    463 	 * PI_RUNNING when it's already set).  We shouldn't bother
    464 	 * generating an event or consuming a signature for these, since
    465 	 * the actual state of the interface is unchanged.
    466 	 */
    467 	if (pi->pi_state == state)
    468 		return;
    469 
    470 	pi->pi_state = state;
    471 	phyint_changed(pi);
    472 }
    473 
    474 /*
    475  * Note that `pi' has changed state.
    476  */
    477 void
    478 phyint_changed(struct phyint *pi)
    479 {
    480 	pi->pi_group->pg_sig++;
    481 	(void) phyint_state_event(pi->pi_group, pi);
    482 }
    483 
    484 /*
    485  * Insert the phyint group in the linked list of all phyint groups
    486  * at the head of the list
    487  */
    488 void
    489 phyint_group_insert(struct phyint_group *pg)
    490 {
    491 	pg->pg_next = phyint_groups;
    492 	pg->pg_prev = NULL;
    493 	if (phyint_groups != NULL)
    494 		phyint_groups->pg_prev = pg;
    495 	phyint_groups = pg;
    496 
    497 	phyint_grouplistsig++;
    498 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
    499 }
    500 
    501 /*
    502  * Create a new phyint group called 'name'.
    503  */
    504 struct phyint_group *
    505 phyint_group_create(const char *name)
    506 {
    507 	struct	phyint_group *pg;
    508 
    509 	if (debug & D_PHYINT)
    510 		logdebug("phyint_group_create(%s)\n", name);
    511 
    512 	pg = calloc(1, sizeof (struct phyint_group));
    513 	if (pg == NULL) {
    514 		logperror("phyint_group_create: calloc");
    515 		return (NULL);
    516 	}
    517 
    518 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
    519 	pg->pg_sig = gensig();
    520 	pg->pg_fdt = user_failure_detection_time;
    521 	pg->pg_probeint = user_probe_interval;
    522 	pg->pg_in_use = _B_TRUE;
    523 
    524 	/*
    525 	 * Normal groups always start in the PG_FAILED state since they
    526 	 * have no active interfaces.  In contrast, anonymous groups are
    527 	 * heterogeneous and thus always PG_OK.
    528 	 */
    529 	pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
    530 
    531 	return (pg);
    532 }
    533 
    534 /*
    535  * Change the state of the phyint group `pg' to state `state'.
    536  */
    537 void
    538 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
    539 {
    540 	assert(pg != phyint_anongroup);
    541 
    542 	/*
    543 	 * To simplify things, some callers always set a given state
    544 	 * regardless of the previous state of the group (e.g., setting
    545 	 * PG_DEGRADED when it's already set).  We shouldn't bother
    546 	 * generating an event or consuming a signature for these, since
    547 	 * the actual state of the group is unchanged.
    548 	 */
    549 	if (pg->pg_state == state)
    550 		return;
    551 
    552 	pg->pg_state = state;
    553 
    554 	switch (state) {
    555 	case PG_FAILED:
    556 		/*
    557 		 * We can never know with certainty that a group has
    558 		 * failed.  It is possible that all known targets have
    559 		 * failed simultaneously, and new targets have come up
    560 		 * instead. If the targets are routers then router
    561 		 * discovery will kick in, and we will see the new routers
    562 		 * thru routing socket messages. But if the targets are
    563 		 * hosts, we have to discover it by multicast.	So flush
    564 		 * all the host targets. The next probe will send out a
    565 		 * multicast echo request. If this is a group failure, we
    566 		 * will still not see any response, otherwise the group
    567 		 * will be repaired after we get NUM_PROBE_REPAIRS
    568 		 * consecutive unicast replies on any phyint.
    569 		 */
    570 		target_flush_hosts(pg);
    571 		break;
    572 
    573 	case PG_OK:
    574 	case PG_DEGRADED:
    575 		break;
    576 
    577 	default:
    578 		logerr("phyint_group_chstate: invalid group state %d; "
    579 		    "aborting\n", state);
    580 		abort();
    581 	}
    582 
    583 	pg->pg_sig++;
    584 	(void) phyint_group_state_event(pg);
    585 }
    586 
    587 /*
    588  * Create a new phyint instance and initialize it from the values supplied by
    589  * the kernel. Always check for ENXIO before logging any error, because the
    590  * interface could have vanished after completion of SIOCGLIFCONF.
    591  * Return values:
    592  *	pointer to the phyint instance on success
    593  *	NULL on failure Eg. if the phyint instance is not found in the kernel
    594  */
    595 struct phyint_instance *
    596 phyint_inst_init_from_k(int af, char *pi_name)
    597 {
    598 	char	pg_name[LIFNAMSIZ + 1];
    599 	int	ifsock;
    600 	uint_t	ifindex;
    601 	uint64_t	flags;
    602 	struct lifreq	lifr;
    603 	struct phyint	*pi;
    604 	struct phyint_instance	*pii;
    605 	boolean_t	pi_created;
    606 	struct phyint_group	*pg;
    607 
    608 retry:
    609 	pii = NULL;
    610 	pi = NULL;
    611 	pg = NULL;
    612 	pi_created = _B_FALSE;
    613 
    614 	if (debug & D_PHYINT) {
    615 		logdebug("phyint_inst_init_from_k(%s %s)\n",
    616 		    AF_STR(af), pi_name);
    617 	}
    618 
    619 	assert(af == AF_INET || af == AF_INET6);
    620 
    621 	/* Get the socket for doing ioctls */
    622 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
    623 
    624 	/*
    625 	 * Get the interface flags.  Ignore virtual interfaces, IPMP
    626 	 * meta-interfaces, point-to-point interfaces, and interfaces
    627 	 * that can't support multicast.
    628 	 */
    629 	(void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
    630 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
    631 		if (errno != ENXIO) {
    632 			logperror("phyint_inst_init_from_k:"
    633 			    " ioctl (get flags)");
    634 		}
    635 		return (NULL);
    636 	}
    637 	flags = lifr.lifr_flags;
    638 	if (!(flags & IFF_MULTICAST) ||
    639 	    (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
    640 		return (NULL);
    641 
    642 	/*
    643 	 * Get the ifindex for recording later in our tables, in case we need
    644 	 * to create a new phyint.
    645 	 */
    646 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
    647 		if (errno != ENXIO) {
    648 			logperror("phyint_inst_init_from_k: "
    649 			    " ioctl (get lifindex)");
    650 		}
    651 		return (NULL);
    652 	}
    653 	ifindex = lifr.lifr_index;
    654 
    655 	/*
    656 	 * Get the phyint group name of this phyint, from the kernel.
    657 	 */
    658 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
    659 		if (errno != ENXIO) {
    660 			logperror("phyint_inst_init_from_k: "
    661 			    "ioctl (get group name)");
    662 		}
    663 		return (NULL);
    664 	}
    665 	(void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
    666 
    667 	/*
    668 	 * If the phyint is not part of any group, pg_name is the
    669 	 * null string. If 'track_all_phyints' is false, there is no
    670 	 * need to create a phyint.
    671 	 */
    672 	if (pg_name[0] == '\0' && !track_all_phyints) {
    673 		/*
    674 		 * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
    675 		 * set, reset them. These flags shouldn't be set if in.mpathd
    676 		 * isn't tracking the interface.
    677 		 */
    678 		if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
    679 			lifr.lifr_flags = flags &
    680 			    ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
    681 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
    682 				if (errno != ENXIO) {
    683 					logperror("phyint_inst_init_from_k:"
    684 					    " ioctl (set flags)");
    685 				}
    686 			}
    687 		}
    688 		return (NULL);
    689 	}
    690 
    691 	/*
    692 	 * We need to create a new phyint instance.  We may also need to
    693 	 * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
    694 	 * an underlying interface before it found its IPMP meta-interface.
    695 	 * Note that we keep any created groups even if phyint_inst_from_k()
    696 	 * fails since a group's existence is not dependent on the ability of
    697 	 * in.mpathd to the track the group's interfaces.
    698 	 */
    699 	if ((pg = phyint_group_lookup(pg_name)) == NULL) {
    700 		if ((pg = phyint_group_create(pg_name)) == NULL) {
    701 			logerr("phyint_inst_init_from_k: cannot create group "
    702 			    "%s\n", pg_name);
    703 			return (NULL);
    704 		}
    705 		phyint_group_insert(pg);
    706 	}
    707 
    708 	/*
    709 	 * Lookup the phyint. If the phyint does not exist create it.
    710 	 */
    711 	pi = phyint_lookup(pi_name);
    712 	if (pi == NULL) {
    713 		pi = phyint_create(pi_name, pg, ifindex, flags);
    714 		if (pi == NULL) {
    715 			logerr("phyint_inst_init_from_k:"
    716 			    " unable to create phyint %s\n", pi_name);
    717 			return (NULL);
    718 		}
    719 		pi_created = _B_TRUE;
    720 	} else {
    721 		/* The phyint exists already. */
    722 		assert(pi_created == _B_FALSE);
    723 		/*
    724 		 * Normally we should see consistent values for the IPv4 and
    725 		 * IPv6 instances, for phyint properties. If we don't, it
    726 		 * means things have changed underneath us, and we should
    727 		 * resync our tables with the kernel. Check whether the
    728 		 * interface index has changed. If so, it is most likely
    729 		 * the interface has been unplumbed and replumbed,
    730 		 * while we are yet to update our tables. Do it now.
    731 		 */
    732 		if (pi->pi_ifindex != ifindex) {
    733 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
    734 			goto retry;
    735 		}
    736 		assert(PHYINT_INSTANCE(pi, af) == NULL);
    737 
    738 		/*
    739 		 * If the group name seen by the IPv4 and IPv6 instances
    740 		 * are different, it is most likely the groupname has
    741 		 * changed, while we are yet to update our tables. Do it now.
    742 		 */
    743 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
    744 			phyint_inst_delete(PHYINT_INSTANCE(pi,
    745 			    AF_OTHER(af)));
    746 			goto retry;
    747 		}
    748 	}
    749 
    750 	/*
    751 	 * Create a new phyint instance, corresponding to the 'af'
    752 	 * passed in.
    753 	 */
    754 	pii = phyint_inst_create(pi, af);
    755 	if (pii == NULL) {
    756 		logerr("phyint_inst_init_from_k: unable to create"
    757 		    "phyint inst %s\n", pi->pi_name);
    758 		if (pi_created)
    759 			phyint_delete(pi);
    760 
    761 		return (NULL);
    762 	}
    763 
    764 	/*
    765 	 * NOTE: the change_pif_flags() implementation requires a phyint
    766 	 * instance before it can function, so a number of tasks that would
    767 	 * otherwise be done in phyint_create() are deferred to here.
    768 	 */
    769 	if (pi_created) {
    770 		/*
    771 		 * If the interface is offline, set the state to PI_OFFLINE.
    772 		 * Otherwise, optimistically consider this interface running.
    773 		 * Later (in process_link_state_changes()), we will adjust
    774 		 * this to match the current state of the link.  Further, if
    775 		 * test addresses are subsequently assigned, we will
    776 		 * transition to PI_NOTARGETS and then to either PI_RUNNING or
    777 		 * PI_FAILED depending on the probe results.
    778 		 */
    779 		if (pi->pi_flags & IFF_OFFLINE) {
    780 			phyint_chstate(pi, PI_OFFLINE);
    781 		} else {
    782 			/* calls phyint_chstate() */
    783 			phyint_transition_to_running(pi);
    784 		}
    785 
    786 		/*
    787 		 * If this a standby phyint, determine whether it should be
    788 		 * IFF_INACTIVE.
    789 		 */
    790 		if (pi->pi_flags & IFF_STANDBY)
    791 			phyint_standby_refresh_inactive(pi);
    792 
    793 		/*
    794 		 * If this phyint does not have a unique hardware address in its
    795 		 * group, offline it.
    796 		 */
    797 		if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
    798 			pi->pi_hwaddrdup = _B_TRUE;
    799 			(void) phyint_offline(pi, 0);
    800 		}
    801 	}
    802 
    803 	return (pii);
    804 }
    805 
    806 /*
    807  * Bind pii_probe_sock to the address associated with pii_probe_logint.
    808  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
    809  * targets. Do the common part in this function, and complete the
    810  * initializations by calling the protocol specific functions
    811  * phyint_inst_v{4,6}_sockinit() respectively.
    812  *
    813  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
    814  */
    815 boolean_t
    816 phyint_inst_sockinit(struct phyint_instance *pii)
    817 {
    818 	boolean_t success;
    819 	struct phyint_group *pg;
    820 
    821 	if (debug & D_PHYINT) {
    822 		logdebug("phyint_inst_sockinit(%s %s)\n",
    823 		    AF_STR(pii->pii_af), pii->pii_name);
    824 	}
    825 
    826 	assert(pii->pii_probe_logint != NULL);
    827 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
    828 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
    829 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
    830 
    831 	/*
    832 	 * If the socket is already bound, close pii_probe_sock
    833 	 */
    834 	if (pii->pii_probe_sock != -1)
    835 		close_probe_socket(pii, _B_TRUE);
    836 
    837 	/*
    838 	 * If the phyint is not part of a named group and track_all_phyints is
    839 	 * false, simply return.
    840 	 */
    841 	pg = pii->pii_phyint->pi_group;
    842 	if (pg == phyint_anongroup && !track_all_phyints) {
    843 		if (debug & D_PHYINT)
    844 			logdebug("phyint_inst_sockinit: no group\n");
    845 		return (_B_FALSE);
    846 	}
    847 
    848 	/*
    849 	 * Initialize the socket by calling the protocol specific function.
    850 	 * If it succeeds, add the socket to the poll list.
    851 	 */
    852 	if (pii->pii_af == AF_INET6)
    853 		success = phyint_inst_v6_sockinit(pii);
    854 	else
    855 		success = phyint_inst_v4_sockinit(pii);
    856 
    857 	if (success && (poll_add(pii->pii_probe_sock) == 0))
    858 		return (_B_TRUE);
    859 
    860 	/* Something failed, cleanup and return false */
    861 	if (pii->pii_probe_sock != -1)
    862 		close_probe_socket(pii, _B_FALSE);
    863 
    864 	return (_B_FALSE);
    865 }
    866 
    867 /*
    868  * IPv6 specific part in initializing the pii_probe_sock. This socket is
    869  * used to send/receive ICMPv6 probe packets.
    870  */
    871 static boolean_t
    872 phyint_inst_v6_sockinit(struct phyint_instance *pii)
    873 {
    874 	icmp6_filter_t filter;
    875 	int hopcount = 1;
    876 	int off = 0;
    877 	int on = 1;
    878 	struct	sockaddr_in6	testaddr;
    879 	int flags;
    880 
    881 	/*
    882 	 * Open a raw socket with ICMPv6 protocol.
    883 	 *
    884 	 * Use IPV6_BOUND_IF to make sure that probes are sent and received on
    885 	 * the specified phyint only.  Bind to the test address to ensure that
    886 	 * the responses are sent to the specified phyint.
    887 	 *
    888 	 * Set the hopcount to 1 so that probe packets are not routed.
    889 	 * Disable multicast loopback. Set the receive filter to
    890 	 * receive only ICMPv6 echo replies.
    891 	 */
    892 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
    893 	if (pii->pii_probe_sock < 0) {
    894 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
    895 		return (_B_FALSE);
    896 	}
    897 
    898 	/*
    899 	 * Probes must not block in case of lower layer issues.
    900 	 */
    901 	if ((flags = fcntl(pii->pii_probe_sock, F_GETFL, 0)) == -1) {
    902 		logperror_pii(pii, "phyint_inst_v6_sockinit: fcntl"
    903 		    " F_GETFL");
    904 		return (_B_FALSE);
    905 	}
    906 	if (fcntl(pii->pii_probe_sock, F_SETFL,
    907 	    flags | O_NONBLOCK) == -1) {
    908 		logperror_pii(pii, "phyint_inst_v6_sockinit: fcntl"
    909 		    " F_SETFL O_NONBLOCK");
    910 		return (_B_FALSE);
    911 	}
    912 
    913 	bzero(&testaddr, sizeof (testaddr));
    914 	testaddr.sin6_family = AF_INET6;
    915 	testaddr.sin6_port = 0;
    916 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
    917 
    918 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
    919 	    sizeof (testaddr)) < 0) {
    920 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
    921 		return (_B_FALSE);
    922 	}
    923 
    924 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
    925 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
    926 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    927 		    " IPV6_MULTICAST_IF");
    928 		return (_B_FALSE);
    929 	}
    930 
    931 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
    932 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
    933 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    934 		    " IPV6_BOUND_IF");
    935 		return (_B_FALSE);
    936 	}
    937 
    938 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
    939 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
    940 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    941 		    " IPV6_UNICAST_HOPS");
    942 		return (_B_FALSE);
    943 	}
    944 
    945 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
    946 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
    947 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    948 		    " IPV6_MULTICAST_HOPS");
    949 		return (_B_FALSE);
    950 	}
    951 
    952 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
    953 	    (char *)&off, sizeof (off)) < 0) {
    954 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    955 		    " IPV6_MULTICAST_LOOP");
    956 		return (_B_FALSE);
    957 	}
    958 
    959 	/*
    960 	 * Filter out so that we only receive ICMP echo replies
    961 	 */
    962 	ICMP6_FILTER_SETBLOCKALL(&filter);
    963 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
    964 
    965 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
    966 	    (char *)&filter, sizeof (filter)) < 0) {
    967 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    968 		    " ICMP6_FILTER");
    969 		return (_B_FALSE);
    970 	}
    971 
    972 	/* Enable receipt of hoplimit */
    973 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
    974 	    &on, sizeof (on)) < 0) {
    975 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    976 		    " IPV6_RECVHOPLIMIT");
    977 		return (_B_FALSE);
    978 	}
    979 
    980 	/* Enable receipt of timestamp */
    981 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
    982 	    &on, sizeof (on)) < 0) {
    983 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
    984 		    " SO_TIMESTAMP");
    985 		return (_B_FALSE);
    986 	}
    987 
    988 	return (_B_TRUE);
    989 }
    990 
    991 /*
    992  * IPv4 specific part in initializing the pii_probe_sock. This socket is
    993  * used to send/receive ICMPv4 probe packets.
    994  */
    995 static boolean_t
    996 phyint_inst_v4_sockinit(struct phyint_instance *pii)
    997 {
    998 	struct sockaddr_in  testaddr;
    999 	char	char_off = 0;
   1000 	int	ttl = 1;
   1001 	char	char_ttl = 1;
   1002 	int	on = 1;
   1003 	int	flags;
   1004 
   1005 	/*
   1006 	 * Open a raw socket with ICMPv4 protocol.
   1007 	 *
   1008 	 * Use IP_BOUND_IF to make sure that probes are sent and received on
   1009 	 * the specified phyint only.  Bind to the test address to ensure that
   1010 	 * the responses are sent to the specified phyint.
   1011 	 *
   1012 	 * Set the ttl to 1 so that probe packets are not routed.
   1013 	 * Disable multicast loopback.  Enable receipt of timestamp.
   1014 	 */
   1015 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
   1016 	if (pii->pii_probe_sock < 0) {
   1017 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
   1018 		return (_B_FALSE);
   1019 	}
   1020 
   1021 	/*
   1022 	 * Probes must not block in case of lower layer issues.
   1023 	 */
   1024 	if ((flags = fcntl(pii->pii_probe_sock, F_GETFL, 0)) == -1) {
   1025 		logperror_pii(pii, "phyint_inst_v4_sockinit: fcntl"
   1026 		    " F_GETFL");
   1027 		return (_B_FALSE);
   1028 	}
   1029 	if (fcntl(pii->pii_probe_sock, F_SETFL,
   1030 	    flags | O_NONBLOCK) == -1) {
   1031 		logperror_pii(pii, "phyint_inst_v4_sockinit: fcntl"
   1032 		    " F_SETFL O_NONBLOCK");
   1033 		return (_B_FALSE);
   1034 	}
   1035 
   1036 	bzero(&testaddr, sizeof (testaddr));
   1037 	testaddr.sin_family = AF_INET;
   1038 	testaddr.sin_port = 0;
   1039 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
   1040 	    &testaddr.sin_addr);
   1041 
   1042 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
   1043 	    sizeof (testaddr)) < 0) {
   1044 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
   1045 		return (_B_FALSE);
   1046 	}
   1047 
   1048 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
   1049 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
   1050 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
   1051 		    " IP_BOUND_IF");
   1052 		return (_B_FALSE);
   1053 	}
   1054 
   1055 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
   1056 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
   1057 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
   1058 		    " IP_MULTICAST_IF");
   1059 		return (_B_FALSE);
   1060 	}
   1061 
   1062 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
   1063 	    (char *)&ttl, sizeof (ttl)) < 0) {
   1064 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
   1065 		    " IP_TTL");
   1066 		return (_B_FALSE);
   1067 	}
   1068 
   1069 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
   1070 	    (char *)&char_off, sizeof (char_off)) == -1) {
   1071 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
   1072 		    " IP_MULTICAST_LOOP");
   1073 		return (_B_FALSE);
   1074 	}
   1075 
   1076 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
   1077 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
   1078 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
   1079 		    " IP_MULTICAST_TTL");
   1080 		return (_B_FALSE);
   1081 	}
   1082 
   1083 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
   1084 	    sizeof (on)) < 0) {
   1085 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
   1086 		    " SO_TIMESTAMP");
   1087 		return (_B_FALSE);
   1088 	}
   1089 
   1090 	return (_B_TRUE);
   1091 }
   1092 
   1093 /*
   1094  * Remove the phyint group from the list of 'all phyint groups'
   1095  * and free it.
   1096  */
   1097 void
   1098 phyint_group_delete(struct phyint_group *pg)
   1099 {
   1100 	/*
   1101 	 * The anonymous group always exists, even when empty.
   1102 	 */
   1103 	if (pg == phyint_anongroup)
   1104 		return;
   1105 
   1106 	if (debug & D_PHYINT)
   1107 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
   1108 
   1109 	/*
   1110 	 * The phyint group must be empty, and must not have any phyints.
   1111 	 * The phyint group must be in the list of all phyint groups
   1112 	 */
   1113 	assert(pg->pg_phyint == NULL);
   1114 	assert(phyint_groups == pg || pg->pg_prev != NULL);
   1115 
   1116 	if (pg->pg_prev != NULL)
   1117 		pg->pg_prev->pg_next = pg->pg_next;
   1118 	else
   1119 		phyint_groups = pg->pg_next;
   1120 
   1121 	if (pg->pg_next != NULL)
   1122 		pg->pg_next->pg_prev = pg->pg_prev;
   1123 
   1124 	pg->pg_next = NULL;
   1125 	pg->pg_prev = NULL;
   1126 
   1127 	phyint_grouplistsig++;
   1128 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
   1129 
   1130 	addrlist_free(&pg->pg_addrs);
   1131 	free(pg);
   1132 }
   1133 
   1134 /*
   1135  * Refresh the state of `pg' based on its current members.
   1136  */
   1137 void
   1138 phyint_group_refresh_state(struct phyint_group *pg)
   1139 {
   1140 	enum pg_state state;
   1141 	enum pg_state origstate = pg->pg_state;
   1142 	struct phyint *pi, *usablepi;
   1143 	uint_t nif = 0, nusable = 0;
   1144 
   1145 	/*
   1146 	 * Anonymous groups never change state.
   1147 	 */
   1148 	if (pg == phyint_anongroup)
   1149 		return;
   1150 
   1151 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
   1152 		nif++;
   1153 		if (phyint_is_usable(pi)) {
   1154 			nusable++;
   1155 			usablepi = pi;
   1156 		}
   1157 	}
   1158 
   1159 	if (nusable == 0)
   1160 		state = PG_FAILED;
   1161 	else if (nif == nusable)
   1162 		state = PG_OK;
   1163 	else
   1164 		state = PG_DEGRADED;
   1165 
   1166 	phyint_group_chstate(pg, state);
   1167 
   1168 	/*
   1169 	 * If we're shutting down, skip logging messages since otherwise our
   1170 	 * shutdown housecleaning will make us report that groups are unusable.
   1171 	 */
   1172 	if (cleanup_started)
   1173 		return;
   1174 
   1175 	/*
   1176 	 * NOTE: We use pg_failmsg_printed rather than origstate since
   1177 	 * otherwise at startup we'll log a "now usable" message when the
   1178 	 * first usable phyint is added to an empty group.
   1179 	 */
   1180 	if (state != PG_FAILED && pg->pg_failmsg_printed) {
   1181 		assert(origstate == PG_FAILED);
   1182 		logerr("At least 1 IP interface (%s) in group %s is now "
   1183 		    "usable\n", usablepi->pi_name, pg->pg_name);
   1184 		pg->pg_failmsg_printed = _B_FALSE;
   1185 	} else if (origstate != PG_FAILED && state == PG_FAILED) {
   1186 		logerr("All IP interfaces in group %s are now unusable\n",
   1187 		    pg->pg_name);
   1188 		pg->pg_failmsg_printed = _B_TRUE;
   1189 	}
   1190 }
   1191 
   1192 /*
   1193  * Extract information from the kernel about the desired phyint.
   1194  * Look only for properties of the phyint and not properties of logints.
   1195  * Take appropriate action on the changes.
   1196  * Return codes:
   1197  *	PI_OK
   1198  *		The phyint exists in the kernel and matches our knowledge
   1199  *		of the phyint.
   1200  *	PI_DELETED
   1201  *		The phyint has vanished in the kernel.
   1202  *	PI_IFINDEX_CHANGED
   1203  *		The phyint's interface index has changed.
   1204  *		Ask the caller to delete and recreate the phyint.
   1205  *	PI_IOCTL_ERROR
   1206  *		Some ioctl error. Don't change anything.
   1207  *	PI_GROUP_CHANGED
   1208  *		The phyint has changed group.
   1209  */
   1210 int
   1211 phyint_inst_update_from_k(struct phyint_instance *pii)
   1212 {
   1213 	struct lifreq lifr;
   1214 	int	ifsock;
   1215 	struct phyint *pi;
   1216 
   1217 	pi = pii->pii_phyint;
   1218 
   1219 	if (debug & D_PHYINT) {
   1220 		logdebug("phyint_inst_update_from_k(%s %s)\n",
   1221 		    AF_STR(pii->pii_af), pi->pi_name);
   1222 	}
   1223 
   1224 	/*
   1225 	 * Get the ifindex from the kernel, for comparison with the
   1226 	 * value in our tables.
   1227 	 */
   1228 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
   1229 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
   1230 
   1231 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
   1232 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
   1233 		if (errno == ENXIO) {
   1234 			return (PI_DELETED);
   1235 		} else {
   1236 			logperror_pii(pii, "phyint_inst_update_from_k:"
   1237 			    " ioctl (get lifindex)");
   1238 			return (PI_IOCTL_ERROR);
   1239 		}
   1240 	}
   1241 
   1242 	if (lifr.lifr_index != pi->pi_ifindex) {
   1243 		/*
   1244 		 * The index has changed. Most likely the interface has
   1245 		 * been unplumbed and replumbed. Ask the caller to take
   1246 		 * appropriate action.
   1247 		 */
   1248 		if (debug & D_PHYINT) {
   1249 			logdebug("phyint_inst_update_from_k:"
   1250 			    " old index %d new index %d\n",
   1251 			    pi->pi_ifindex, lifr.lifr_index);
   1252 		}
   1253 		return (PI_IFINDEX_CHANGED);
   1254 	}
   1255 
   1256 	/*
   1257 	 * Get the group name from the kernel, for comparison with
   1258 	 * the value in our tables.
   1259 	 */
   1260 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
   1261 		if (errno == ENXIO) {
   1262 			return (PI_DELETED);
   1263 		} else {
   1264 			logperror_pii(pii, "phyint_inst_update_from_k:"
   1265 			    " ioctl (get groupname)");
   1266 			return (PI_IOCTL_ERROR);
   1267 		}
   1268 	}
   1269 
   1270 	/*
   1271 	 * If the phyint has changed group i.e. if the phyint group name
   1272 	 * returned by the kernel is different, ask the caller to delete
   1273 	 * and recreate the phyint in the right group
   1274 	 */
   1275 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
   1276 		/* Groupname has changed */
   1277 		if (debug & D_PHYINT) {
   1278 			logdebug("phyint_inst_update_from_k:"
   1279 			    " groupname change\n");
   1280 		}
   1281 		return (PI_GROUP_CHANGED);
   1282 	}
   1283 
   1284 	/*
   1285 	 * Get the current phyint flags from the kernel, and determine what
   1286 	 * flags have changed by comparing against our tables.	Note that the
   1287 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
   1288 	 * that IFF_INACTIVE is really still set on the interface.
   1289 	 */
   1290 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
   1291 		if (errno == ENXIO) {
   1292 			return (PI_DELETED);
   1293 		} else {
   1294 			logperror_pii(pii, "phyint_inst_update_from_k: "
   1295 			    " ioctl (get flags)");
   1296 			return (PI_IOCTL_ERROR);
   1297 		}
   1298 	}
   1299 
   1300 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
   1301 	if (pi->pi_v4 != NULL)
   1302 		pi->pi_v4->pii_flags = pi->pi_flags;
   1303 	if (pi->pi_v6 != NULL)
   1304 		pi->pi_v6->pii_flags = pi->pi_flags;
   1305 
   1306 	/*
   1307 	 * Make sure the IFF_FAILED flag is set if and only if we think
   1308 	 * the interface should be failed.
   1309 	 */
   1310 	if (pi->pi_flags & IFF_FAILED) {
   1311 		if (pi->pi_state == PI_RUNNING)
   1312 			(void) change_pif_flags(pi, 0, IFF_FAILED);
   1313 	} else {
   1314 		if (pi->pi_state == PI_FAILED)
   1315 			(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
   1316 	}
   1317 
   1318 	/* No change in phyint status */
   1319 	return (PI_OK);
   1320 }
   1321 
   1322 /*
   1323  * Delete the phyint. Remove it from the list of all phyints, and the
   1324  * list of phyint group members.
   1325  */
   1326 static void
   1327 phyint_delete(struct phyint *pi)
   1328 {
   1329 	boolean_t active;
   1330 	struct phyint *pi2;
   1331 	struct phyint_group *pg = pi->pi_group;
   1332 
   1333 	if (debug & D_PHYINT)
   1334 		logdebug("phyint_delete(%s)\n", pi->pi_name);
   1335 
   1336 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
   1337 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
   1338 
   1339 	/*
   1340 	 * The phyint must belong to a group.
   1341 	 */
   1342 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
   1343 
   1344 	/* The phyint must be in the list of all phyints */
   1345 	assert(phyints == pi || pi->pi_prev != NULL);
   1346 
   1347 	/* Remove the phyint from the phyint group list */
   1348 	pg->pg_sig++;
   1349 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
   1350 
   1351 	if (pi->pi_pgprev == NULL) {
   1352 		/* Phyint is the 1st in the phyint group list */
   1353 		pg->pg_phyint = pi->pi_pgnext;
   1354 	} else {
   1355 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
   1356 	}
   1357 	if (pi->pi_pgnext != NULL)
   1358 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
   1359 	pi->pi_pgnext = NULL;
   1360 	pi->pi_pgprev = NULL;
   1361 
   1362 	/* Refresh the group state now that this phyint has been removed */
   1363 	phyint_group_refresh_state(pg);
   1364 
   1365 	/* Remove the phyint from the global list of phyints */
   1366 	if (pi->pi_prev == NULL) {
   1367 		/* Phyint is the 1st in the list */
   1368 		phyints = pi->pi_next;
   1369 	} else {
   1370 		pi->pi_prev->pi_next = pi->pi_next;
   1371 	}
   1372 	if (pi->pi_next != NULL)
   1373 		pi->pi_next->pi_prev = pi->pi_prev;
   1374 	pi->pi_next = NULL;
   1375 	pi->pi_prev = NULL;
   1376 
   1377 	/*
   1378 	 * See if another phyint in the group had been offlined because
   1379 	 * it was a dup of `pi' -- and if so, online it.
   1380 	 */
   1381 	if (!pi->pi_hwaddrdup &&
   1382 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
   1383 		assert(pi2->pi_hwaddrdup);
   1384 		(void) phyint_undo_offline(pi2);
   1385 	}
   1386 
   1387 	/*
   1388 	 * If the interface was in a named group and was either an active
   1389 	 * standby or the last active interface, try to activate another
   1390 	 * interface to compensate.
   1391 	 */
   1392 	if (pg != phyint_anongroup) {
   1393 		active = _B_FALSE;
   1394 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
   1395 			if (phyint_is_functioning(pi2) &&
   1396 			    !(pi2->pi_flags & IFF_INACTIVE)) {
   1397 				active = _B_TRUE;
   1398 				break;
   1399 			}
   1400 		}
   1401 
   1402 		if (!active ||
   1403 		    (pi->pi_flags & (IFF_STANDBY|IFF_INACTIVE)) == IFF_STANDBY)
   1404 			phyint_activate_another(pi);
   1405 	}
   1406 
   1407 	phyint_link_close(pi);
   1408 	free(pi);
   1409 }
   1410 
   1411 /*
   1412  * Offline phyint `pi' if at least `minred' usable interfaces remain in the
   1413  * group.  Returns an IPMP error code.
   1414  */
   1415 int
   1416 phyint_offline(struct phyint *pi, uint_t minred)
   1417 {
   1418 	boolean_t was_active;
   1419 	unsigned int nusable = 0;
   1420 	struct phyint *pi2;
   1421 	struct phyint_group *pg = pi->pi_group;
   1422 
   1423 	/*
   1424 	 * Verify that enough usable interfaces in the group would remain.
   1425 	 * As a special case, if the group has failed, allow any non-offline
   1426 	 * phyints to be offlined.
   1427 	 */
   1428 	if (pg != phyint_anongroup) {
   1429 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
   1430 			if (pi2 == pi)
   1431 				continue;
   1432 			if (phyint_is_usable(pi2) ||
   1433 			    (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
   1434 				nusable++;
   1435 		}
   1436 	}
   1437 	if (nusable < minred)
   1438 		return (IPMP_EMINRED);
   1439 
   1440 	was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
   1441 
   1442 	if (!change_pif_flags(pi, IFF_OFFLINE, IFF_INACTIVE))
   1443 		return (IPMP_FAILURE);
   1444 
   1445 	/*
   1446 	 * The interface is now offline, so stop probing it.  Note that
   1447 	 * if_mpadm(1M) will down the test addresses, after receiving a
   1448 	 * success reply from us. The routing socket message will then make us
   1449 	 * close the socket used for sending probes. But it is more logical
   1450 	 * that an offlined interface must not be probed, even if it has test
   1451 	 * addresses.
   1452 	 *
   1453 	 * NOTE: stop_probing() also sets PI_OFFLINE.
   1454 	 */
   1455 	stop_probing(pi);
   1456 
   1457 	/*
   1458 	 * If we're offlining the phyint because it has a duplicate hardware
   1459 	 * address, print a warning -- and leave the link open so that we can
   1460 	 * be notified of hardware address changes that make it usable again.
   1461 	 * Otherwise, close the link so that we won't prevent a detach.
   1462 	 */
   1463 	if (pi->pi_hwaddrdup) {
   1464 		logerr("IP interface %s has a hardware address which is not "
   1465 		    "unique in group %s; offlining\n", pi->pi_name,
   1466 		    pg->pg_name);
   1467 	} else {
   1468 		phyint_link_close(pi);
   1469 	}
   1470 
   1471 	/*
   1472 	 * If this phyint was preventing another phyint with a duplicate
   1473 	 * hardware address from being online, bring that one online now.
   1474 	 */
   1475 	if (!pi->pi_hwaddrdup &&
   1476 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
   1477 		assert(pi2->pi_hwaddrdup);
   1478 		(void) phyint_undo_offline(pi2);
   1479 	}
   1480 
   1481 	/*
   1482 	 * If this interface was active, try to activate another INACTIVE
   1483 	 * interface in the group.
   1484 	 */
   1485 	if (was_active)
   1486 		phyint_activate_another(pi);
   1487 
   1488 	return (IPMP_SUCCESS);
   1489 }
   1490 
   1491 /*
   1492  * Undo a previous offline of `pi'.  Returns an IPMP error code.
   1493  */
   1494 int
   1495 phyint_undo_offline(struct phyint *pi)
   1496 {
   1497 	if (pi->pi_state != PI_OFFLINE) {
   1498 		errno = EINVAL;
   1499 		return (IPMP_FAILURE);
   1500 	}
   1501 
   1502 	/*
   1503 	 * If necessary, reinitialize our link information and verify that its
   1504 	 * hardware address is still unique across the group.
   1505 	 */
   1506 	if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
   1507 		errno = EIO;
   1508 		return (IPMP_FAILURE);
   1509 	}
   1510 
   1511 	if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
   1512 		pi->pi_hwaddrdup = _B_TRUE;
   1513 		return (IPMP_EHWADDRDUP);
   1514 	}
   1515 
   1516 	if (pi->pi_hwaddrdup) {
   1517 		logerr("IP interface %s now has a unique hardware address in "
   1518 		    "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
   1519 		pi->pi_hwaddrdup = _B_FALSE;
   1520 	}
   1521 
   1522 	if (!change_pif_flags(pi, 0, IFF_OFFLINE))
   1523 		return (IPMP_FAILURE);
   1524 
   1525 	/*
   1526 	 * While the interface was offline, it may have failed (e.g. the link
   1527 	 * may have gone down).  phyint_inst_check_for_failure() will have
   1528 	 * already set pi_flags with IFF_FAILED, so we can use that to decide
   1529 	 * whether the phyint should transition to running.  Note that after
   1530 	 * we transition to running, we will start sending probes again (if
   1531 	 * test addresses are configured), which may also reveal that the
   1532 	 * interface is in fact failed.
   1533 	 */
   1534 	if (pi->pi_flags & IFF_FAILED) {
   1535 		phyint_chstate(pi, PI_FAILED);
   1536 	} else {
   1537 		/* calls phyint_chstate() */
   1538 		phyint_transition_to_running(pi);
   1539 	}
   1540 
   1541 	/*
   1542 	 * Give the requestor time to configure test addresses before
   1543 	 * complaining that they're missing.
   1544 	 */
   1545 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
   1546 
   1547 	return (IPMP_SUCCESS);
   1548 }
   1549 
   1550 /*
   1551  * Delete (unlink and free), the phyint instance.
   1552  */
   1553 void
   1554 phyint_inst_delete(struct phyint_instance *pii)
   1555 {
   1556 	struct phyint *pi = pii->pii_phyint;
   1557 
   1558 	assert(pi != NULL);
   1559 
   1560 	if (debug & D_PHYINT) {
   1561 		logdebug("phyint_inst_delete(%s %s)\n",
   1562 		    AF_STR(pii->pii_af), pi->pi_name);
   1563 	}
   1564 
   1565 	/*
   1566 	 * If the phyint instance has associated probe targets
   1567 	 * delete all the targets
   1568 	 */
   1569 	while (pii->pii_targets != NULL)
   1570 		target_delete(pii->pii_targets);
   1571 
   1572 	/*
   1573 	 * Delete all the logints associated with this phyint
   1574 	 * instance.
   1575 	 */
   1576 	while (pii->pii_logint != NULL)
   1577 		logint_delete(pii->pii_logint);
   1578 
   1579 	/*
   1580 	 * Close the socket used to send probes to targets from this phyint.
   1581 	 */
   1582 	if (pii->pii_probe_sock != -1)
   1583 		close_probe_socket(pii, _B_TRUE);
   1584 
   1585 	/*
   1586 	 * Phyint instance must be in the list of all phyint instances.
   1587 	 * Remove phyint instance from the global list of phyint instances.
   1588 	 */
   1589 	assert(phyint_instances == pii || pii->pii_prev != NULL);
   1590 	if (pii->pii_prev == NULL) {
   1591 		/* Phyint is the 1st in the list */
   1592 		phyint_instances = pii->pii_next;
   1593 	} else {
   1594 		pii->pii_prev->pii_next = pii->pii_next;
   1595 	}
   1596 	if (pii->pii_next != NULL)
   1597 		pii->pii_next->pii_prev = pii->pii_prev;
   1598 	pii->pii_next = NULL;
   1599 	pii->pii_prev = NULL;
   1600 
   1601 	/*
   1602 	 * Reset the phyint instance pointer in the phyint.
   1603 	 * If this is the last phyint instance (being deleted) on this
   1604 	 * phyint, then delete the phyint.
   1605 	 */
   1606 	if (pii->pii_af == AF_INET)
   1607 		pi->pi_v4 = NULL;
   1608 	else
   1609 		pi->pi_v6 = NULL;
   1610 
   1611 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
   1612 		phyint_delete(pi);
   1613 
   1614 	free(pii);
   1615 }
   1616 
   1617 static void
   1618 phyint_inst_print(struct phyint_instance *pii)
   1619 {
   1620 	struct logint *li;
   1621 	struct target *tg;
   1622 	char abuf[INET6_ADDRSTRLEN];
   1623 	int most_recent;
   1624 	int i;
   1625 
   1626 	if (pii->pii_phyint == NULL) {
   1627 		logdebug("pii->pi_phyint NULL can't print\n");
   1628 		return;
   1629 	}
   1630 
   1631 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
   1632 	    "sock %x in_use %d\n",
   1633 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
   1634 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
   1635 	    pii->pii_in_use);
   1636 
   1637 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
   1638 		logint_print(li);
   1639 
   1640 	logdebug("\n");
   1641 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
   1642 		target_print(tg);
   1643 
   1644 	if (pii->pii_targets == NULL)
   1645 		logdebug("pi_targets NULL\n");
   1646 
   1647 	if (pii->pii_target_next != NULL) {
   1648 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
   1649 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
   1650 		    abuf, sizeof (abuf)));
   1651 	} else {
   1652 		logdebug("pi_target_next NULL\n");
   1653 	}
   1654 
   1655 	if (pii->pii_rtt_target_next != NULL) {
   1656 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
   1657 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
   1658 		    abuf, sizeof (abuf)));
   1659 	} else {
   1660 		logdebug("pi_rtt_target_next NULL\n");
   1661 	}
   1662 
   1663 	if (pii->pii_targets != NULL) {
   1664 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
   1665 
   1666 		i = most_recent;
   1667 		do {
   1668 			if (pii->pii_probes[i].pr_target != NULL) {
   1669 				logdebug("#%d target %s ", i,
   1670 				    pr_addr(pii->pii_af,
   1671 				    pii->pii_probes[i].pr_target->tg_address,
   1672 				    abuf, sizeof (abuf)));
   1673 			} else {
   1674 				logdebug("#%d target NULL ", i);
   1675 			}
   1676 			logdebug("time_start %lld status %d "
   1677 			    "time_ackproc %lld time_lost %u",
   1678 			    pii->pii_probes[i].pr_hrtime_start,
   1679 			    pii->pii_probes[i].pr_status,
   1680 			    pii->pii_probes[i].pr_hrtime_ackproc,
   1681 			    pii->pii_probes[i].pr_time_lost);
   1682 			i = PROBE_INDEX_PREV(i);
   1683 		} while (i != most_recent);
   1684 	}
   1685 }
   1686 
   1687 /*
   1688  * Lookup a logint based on the logical interface name, on the given
   1689  * phyint instance.
   1690  */
   1691 static struct logint *
   1692 logint_lookup(struct phyint_instance *pii, char *name)
   1693 {
   1694 	struct logint *li;
   1695 
   1696 	if (debug & D_LOGINT) {
   1697 		logdebug("logint_lookup(%s, %s)\n",
   1698 		    AF_STR(pii->pii_af), name);
   1699 	}
   1700 
   1701 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
   1702 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
   1703 			break;
   1704 	}
   1705 	return (li);
   1706 }
   1707 
   1708 /*
   1709  * Insert a logint at the head of the list of logints of the given
   1710  * phyint instance
   1711  */
   1712 static void
   1713 logint_insert(struct phyint_instance *pii, struct logint *li)
   1714 {
   1715 	li->li_next = pii->pii_logint;
   1716 	li->li_prev = NULL;
   1717 	if (pii->pii_logint != NULL)
   1718 		pii->pii_logint->li_prev = li;
   1719 	pii->pii_logint = li;
   1720 	li->li_phyint_inst = pii;
   1721 }
   1722 
   1723 /*
   1724  * Create a new named logint, on the specified phyint instance.
   1725  */
   1726 static struct logint *
   1727 logint_create(struct phyint_instance *pii, char *name)
   1728 {
   1729 	struct logint *li;
   1730 
   1731 	if (debug & D_LOGINT) {
   1732 		logdebug("logint_create(%s %s %s)\n",
   1733 		    AF_STR(pii->pii_af), pii->pii_name, name);
   1734 	}
   1735 
   1736 	li = calloc(1, sizeof (struct logint));
   1737 	if (li == NULL) {
   1738 		logperror("logint_create: calloc");
   1739 		return (NULL);
   1740 	}
   1741 
   1742 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
   1743 	li->li_name[sizeof (li->li_name) - 1] = '\0';
   1744 	logint_insert(pii, li);
   1745 	return (li);
   1746 }
   1747 
   1748 /*
   1749  * Initialize the logint based on the data returned by the kernel.
   1750  */
   1751 void
   1752 logint_init_from_k(struct phyint_instance *pii, char *li_name)
   1753 {
   1754 	int	ifsock;
   1755 	uint64_t flags;
   1756 	uint64_t saved_flags;
   1757 	struct	logint	*li;
   1758 	struct lifreq	lifr;
   1759 	struct in6_addr	test_subnet;
   1760 	struct in6_addr	testaddr;
   1761 	int	test_subnet_len;
   1762 	struct sockaddr_in6	*sin6;
   1763 	struct sockaddr_in	*sin;
   1764 	char abuf[INET6_ADDRSTRLEN];
   1765 	boolean_t  ptp = _B_FALSE;
   1766 	struct in6_addr tgaddr;
   1767 
   1768 	if (debug & D_LOGINT) {
   1769 		logdebug("logint_init_from_k(%s %s)\n",
   1770 		    AF_STR(pii->pii_af), li_name);
   1771 	}
   1772 
   1773 	/* Get the socket for doing ioctls */
   1774 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
   1775 
   1776 	/*
   1777 	 * Get the flags from the kernel. Also serves as a check whether
   1778 	 * the logical still exists. If it doesn't exist, no need to proceed
   1779 	 * any further. li_in_use will make the caller clean up the logint
   1780 	 */
   1781 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
   1782 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
   1783 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
   1784 		/* Interface may have vanished */
   1785 		if (errno != ENXIO) {
   1786 			logperror_pii(pii, "logint_init_from_k: "
   1787 			    "ioctl (get flags)");
   1788 		}
   1789 		return;
   1790 	}
   1791 
   1792 	flags = lifr.lifr_flags;
   1793 
   1794 	/*
   1795 	 * Verified the logint exists. Now lookup the logint in our tables.
   1796 	 * If it does not exist, create a new logint.
   1797 	 */
   1798 	li = logint_lookup(pii, li_name);
   1799 	if (li == NULL) {
   1800 		li = logint_create(pii, li_name);
   1801 		if (li == NULL) {
   1802 			/*
   1803 			 * Pretend the interface does not exist
   1804 			 * in the kernel
   1805 			 */
   1806 			return;
   1807 		}
   1808 	}
   1809 
   1810 	/*
   1811 	 * Update li->li_flags with the new flags, after saving the old
   1812 	 * value. This is used later to check what flags has changed and
   1813 	 * take any action
   1814 	 */
   1815 	saved_flags = li->li_flags;
   1816 	li->li_flags = flags;
   1817 
   1818 	/*
   1819 	 * Get the address, prefix, prefixlength and update the logint.
   1820 	 * Check if anything has changed. If the logint used for the
   1821 	 * test address has changed, take suitable action.
   1822 	 */
   1823 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
   1824 		/* Interface may have vanished */
   1825 		if (errno != ENXIO) {
   1826 			logperror_li(li, "logint_init_from_k: (get addr)");
   1827 		}
   1828 		goto error;
   1829 	}
   1830 
   1831 	if (pii->pii_af == AF_INET) {
   1832 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
   1833 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
   1834 	} else {
   1835 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
   1836 		testaddr = sin6->sin6_addr;
   1837 	}
   1838 
   1839 	if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
   1840 		/* Interface may have vanished */
   1841 		if (errno != ENXIO)
   1842 			logperror_li(li, "logint_init_from_k: (get subnet)");
   1843 		goto error;
   1844 	}
   1845 	if (lifr.lifr_subnet.ss_family == AF_INET6) {
   1846 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
   1847 		test_subnet = sin6->sin6_addr;
   1848 		test_subnet_len = lifr.lifr_addrlen;
   1849 	} else {
   1850 		sin = (struct sockaddr_in *)&lifr.lifr_subnet;
   1851 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
   1852 		test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
   1853 	}
   1854 
   1855 	/*
   1856 	 * If this is the logint corresponding to the test address used for
   1857 	 * sending probes, then if anything significant has changed we need to
   1858 	 * determine the test address again.  We ignore changes to the
   1859 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
   1860 	 * course.
   1861 	 */
   1862 	if (pii->pii_probe_logint == li) {
   1863 		if (((li->li_flags ^ saved_flags) &
   1864 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
   1865 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
   1866 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
   1867 		    &li->li_subnet)) ||
   1868 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
   1869 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
   1870 			/*
   1871 			 * Something significant that affects the testaddress
   1872 			 * has changed. Redo the testaddress selection later on
   1873 			 * in select_test_ifs(). For now do the cleanup and
   1874 			 * set pii_probe_logint to NULL.
   1875 			 */
   1876 			if (pii->pii_probe_sock != -1)
   1877 				close_probe_socket(pii, _B_TRUE);
   1878 			pii->pii_probe_logint = NULL;
   1879 		}
   1880 	}
   1881 
   1882 
   1883 	/* Update the logint with the values obtained from the kernel.	*/
   1884 	li->li_addr = testaddr;
   1885 	li->li_in_use = 1;
   1886 	if (ptp) {
   1887 		li->li_dstaddr = tgaddr;
   1888 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
   1889 		    IP_ABITS : IPV6_ABITS;
   1890 	} else {
   1891 		li->li_subnet = test_subnet;
   1892 		li->li_subnet_len = test_subnet_len;
   1893 	}
   1894 
   1895 	if (debug & D_LOGINT)
   1896 		logint_print(li);
   1897 
   1898 	return;
   1899 
   1900 error:
   1901 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
   1902 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
   1903 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
   1904 	logint_delete(li);
   1905 }
   1906 
   1907 /*
   1908  * Delete (unlink and free) a logint.
   1909  */
   1910 void
   1911 logint_delete(struct logint *li)
   1912 {
   1913 	struct phyint_instance *pii;
   1914 
   1915 	pii = li->li_phyint_inst;
   1916 	assert(pii != NULL);
   1917 
   1918 	if (debug & D_LOGINT) {
   1919 		int af;
   1920 		char abuf[INET6_ADDRSTRLEN];
   1921 
   1922 		af = pii->pii_af;
   1923 		logdebug("logint_delete(%s %s %s/%u)\n",
   1924 		    AF_STR(af), li->li_name,
   1925 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
   1926 		    li->li_subnet_len);
   1927 	}
   1928 
   1929 	/* logint must be in the list of logints */
   1930 	assert(pii->pii_logint == li || li->li_prev != NULL);
   1931 
   1932 	/* Remove the logint from the list of logints  */
   1933 	if (li->li_prev == NULL) {
   1934 		/* logint is the 1st in the list */
   1935 		pii->pii_logint = li->li_next;
   1936 	} else {
   1937 		li->li_prev->li_next = li->li_next;
   1938 	}
   1939 	if (li->li_next != NULL)
   1940 		li->li_next->li_prev = li->li_prev;
   1941 	li->li_next = NULL;
   1942 	li->li_prev = NULL;
   1943 
   1944 	/*
   1945 	 * If this logint is also being used for probing, then close the
   1946 	 * associated socket, if it exists.
   1947 	 */
   1948 	if (pii->pii_probe_logint == li) {
   1949 		if (pii->pii_probe_sock != -1)
   1950 			close_probe_socket(pii, _B_TRUE);
   1951 		pii->pii_probe_logint = NULL;
   1952 	}
   1953 
   1954 	free(li);
   1955 }
   1956 
   1957 static void
   1958 logint_print(struct logint *li)
   1959 {
   1960 	char abuf[INET6_ADDRSTRLEN];
   1961 	int af = li->li_phyint_inst->pii_af;
   1962 
   1963 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
   1964 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
   1965 
   1966 	logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
   1967 }
   1968 
   1969 char *
   1970 pr_addr(int af, struct in6_addr addr, char *abuf, int len)
   1971 {
   1972 	struct in_addr	addr_v4;
   1973 
   1974 	if (af == AF_INET) {
   1975 		IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4);
   1976 		(void) inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
   1977 	} else {
   1978 		(void) inet_ntop(AF_INET6, (void *)&addr, abuf, len);
   1979 	}
   1980 	return (abuf);
   1981 }
   1982 
   1983 /*
   1984  * Fill in the sockaddr_storage pointed to by `ssp' with the IP address
   1985  * represented by the [`af',`addr'] pair.  Needed because in.mpathd internally
   1986  * stores all addresses as in6_addrs, but we don't want to expose that.
   1987  */
   1988 void
   1989 addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
   1990 {
   1991 	struct sockaddr_in *sinp = (struct sockaddr_in *)ssp;
   1992 	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)ssp;
   1993 
   1994 	assert(af == AF_INET || af == AF_INET6);
   1995 
   1996 	switch (af) {
   1997 	case AF_INET:
   1998 		(void) memset(sinp, 0, sizeof (*sinp));
   1999 		sinp->sin_family = AF_INET;
   2000 		IN6_V4MAPPED_TO_INADDR(addr, &sinp->sin_addr);
   2001 		break;
   2002 	case AF_INET6:
   2003 		(void) memset(sin6p, 0, sizeof (*sin6p));
   2004 		sin6p->sin6_family = AF_INET6;
   2005 		sin6p->sin6_addr = *addr;
   2006 		break;
   2007 	}
   2008 }
   2009 
   2010 /* Lookup target on its address */
   2011 struct target *
   2012 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
   2013 {
   2014 	struct target *tg;
   2015 
   2016 	if (debug & D_TARGET) {
   2017 		char abuf[INET6_ADDRSTRLEN];
   2018 
   2019 		logdebug("target_lookup(%s %s): addr %s\n",
   2020 		    AF_STR(pii->pii_af), pii->pii_name,
   2021 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
   2022 	}
   2023 
   2024 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
   2025 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
   2026 			break;
   2027 	}
   2028 	return (tg);
   2029 }
   2030 
   2031 /*
   2032  * Find and return the next active target, for the next probe.
   2033  * If no active targets are available, return NULL.
   2034  */
   2035 struct target *
   2036 target_next(struct target *tg)
   2037 {
   2038 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
   2039 	struct	target	*marker = tg;
   2040 	hrtime_t now;
   2041 
   2042 	now = gethrtime();
   2043 
   2044 	/*
   2045 	 * Target must be in the list of targets for this phyint
   2046 	 * instance.
   2047 	 */
   2048 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
   2049 	assert(pii->pii_targets != NULL);
   2050 
   2051 	/* Return the next active target */
   2052 	do {
   2053 		/*
   2054 		 * Go to the next target. If we hit the end,
   2055 		 * reset the ptr to the head
   2056 		 */
   2057 		tg = tg->tg_next;
   2058 		if (tg == NULL)
   2059 			tg = pii->pii_targets;
   2060 
   2061 		assert(TG_STATUS_VALID(tg->tg_status));
   2062 
   2063 		switch (tg->tg_status) {
   2064 		case TG_ACTIVE:
   2065 			return (tg);
   2066 
   2067 		case TG_UNUSED:
   2068 			assert(pii->pii_targets_are_routers);
   2069 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
   2070 				/*
   2071 				 * Bubble up the unused target to active
   2072 				 */
   2073 				tg->tg_status = TG_ACTIVE;
   2074 				pii->pii_ntargets++;
   2075 				return (tg);
   2076 			}
   2077 			break;
   2078 
   2079 		case TG_SLOW:
   2080 			assert(pii->pii_targets_are_routers);
   2081 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
   2082 				/*
   2083 				 * Bubble up the slow target to unused
   2084 				 */
   2085 				tg->tg_status = TG_UNUSED;
   2086 			}
   2087 			break;
   2088 
   2089 		case TG_DEAD:
   2090 			assert(pii->pii_targets_are_routers);
   2091 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
   2092 				/*
   2093 				 * Bubble up the dead target to slow
   2094 				 */
   2095 				tg->tg_status = TG_SLOW;
   2096 				tg->tg_latime = now;
   2097 			}
   2098 			break;
   2099 		}
   2100 
   2101 	} while (tg != marker);
   2102 
   2103 	return (NULL);
   2104 }
   2105 
   2106 /*
   2107  * Select the best available target, that is not already TG_ACTIVE,
   2108  * for the caller. The caller will determine whether it wants to
   2109  * make the returned target TG_ACTIVE.
   2110  * The selection order is as follows.
   2111  * 1. pick a TG_UNSED target, if it exists.
   2112  * 2. else pick a TG_SLOW target that has recovered, if it exists
   2113  * 3. else pick any TG_SLOW target, if it exists
   2114  * 4. else pick a TG_DEAD target that has recovered, if it exists
   2115  * 5. else pick any TG_DEAD target, if it exists
   2116  * 6. else return null
   2117  */
   2118 static struct target *
   2119 target_select_best(struct phyint_instance *pii)
   2120 {
   2121 	struct target *tg;
   2122 	struct target *slow = NULL;
   2123 	struct target *dead = NULL;
   2124 	struct target *slow_recovered = NULL;
   2125 	struct target *dead_recovered = NULL;
   2126 	hrtime_t now;
   2127 
   2128 	now = gethrtime();
   2129 
   2130 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
   2131 		assert(TG_STATUS_VALID(tg->tg_status));
   2132 
   2133 		switch (tg->tg_status) {
   2134 		case TG_UNUSED:
   2135 			return (tg);
   2136 
   2137 		case TG_SLOW:
   2138 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
   2139 				slow_recovered = tg;
   2140 				/*
   2141 				 * Promote the slow_recovered to unused
   2142 				 */
   2143 				tg->tg_status = TG_UNUSED;
   2144 			} else {
   2145 				slow = tg;
   2146 			}
   2147 			break;
   2148 
   2149 		case TG_DEAD:
   2150 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
   2151 				dead_recovered = tg;
   2152 				/*
   2153 				 * Promote the dead_recovered to slow
   2154 				 */
   2155 				tg->tg_status = TG_SLOW;
   2156 				tg->tg_latime = now;
   2157 			} else {
   2158 				dead = tg;
   2159 			}
   2160 			break;
   2161 
   2162 		default:
   2163 			break;
   2164 		}
   2165 	}
   2166 
   2167 	if (slow_recovered != NULL)
   2168 		return (slow_recovered);
   2169 	else if (slow != NULL)
   2170 		return (slow);
   2171 	else if (dead_recovered != NULL)
   2172 		return (dead_recovered);
   2173 	else
   2174 		return (dead);
   2175 }
   2176 
   2177 /*
   2178  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
   2179  * that are active, pick the next best below.
   2180  */
   2181 static void
   2182 target_activate_all(struct phyint_instance *pii)
   2183 {
   2184 	struct target *tg;
   2185 
   2186 	assert(pii->pii_ntargets == 0);
   2187 	assert(pii->pii_target_next == NULL);
   2188 	assert(pii->pii_rtt_target_next == NULL);
   2189 	assert(pii->pii_targets_are_routers);
   2190 
   2191 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
   2192 		tg = target_select_best(pii);
   2193 		if (tg == NULL) {
   2194 			/* We are out of targets */
   2195 			return;
   2196 		}
   2197 
   2198 		assert(TG_STATUS_VALID(tg->tg_status));
   2199 		assert(tg->tg_status != TG_ACTIVE);
   2200 		tg->tg_status = TG_ACTIVE;
   2201 		pii->pii_ntargets++;
   2202 		if (pii->pii_target_next == NULL) {
   2203 			pii->pii_target_next = tg;
   2204 			pii->pii_rtt_target_next = tg;
   2205 		}
   2206 	}
   2207 }
   2208 
   2209 static struct target *
   2210 target_first(struct phyint_instance *pii)
   2211 {
   2212 	struct target *tg;
   2213 
   2214 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
   2215 		assert(TG_STATUS_VALID(tg->tg_status));
   2216 		if (tg->tg_status == TG_ACTIVE)
   2217 			break;
   2218 	}
   2219 
   2220 	return (tg);
   2221 }
   2222 
   2223 /*
   2224  * Create a default target entry.
   2225  */
   2226 void
   2227 target_create(struct phyint_instance *pii, struct in6_addr addr,
   2228     boolean_t is_router)
   2229 {
   2230 	struct target *tg;
   2231 	struct phyint *pi;
   2232 	struct logint *li;
   2233 
   2234 	if (debug & D_TARGET) {
   2235 		char abuf[INET6_ADDRSTRLEN];
   2236 
   2237 		logdebug("target_create(%s %s, %s)\n",
   2238 		    AF_STR(pii->pii_af), pii->pii_name,
   2239 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
   2240 	}
   2241 
   2242 	/*
   2243 	 * If the test address is not yet initialized, do not add
   2244 	 * any target, since we cannot determine whether the target
   2245 	 * belongs to the same subnet as the test address.
   2246 	 */
   2247 	li = pii->pii_probe_logint;
   2248 	if (li == NULL)
   2249 		return;
   2250 
   2251 	/*
   2252 	 * If there are multiple subnets associated with an interface, then
   2253 	 * add the target to this phyint instance only if it belongs to the
   2254 	 * same subnet as the test address.  This assures us that we will
   2255 	 * be able to reach this target through our routing table.
   2256 	 */
   2257 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
   2258 		return;
   2259 
   2260 	if (pii->pii_targets != NULL) {
   2261 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
   2262 		if (is_router) {
   2263 			if (!pii->pii_targets_are_routers) {
   2264 				/*
   2265 				 * Prefer router over hosts. Using hosts is a
   2266 				 * fallback mechanism, hence delete all host
   2267 				 * targets.
   2268 				 */
   2269 				while (pii->pii_targets != NULL)
   2270 					target_delete(pii->pii_targets);
   2271 			}
   2272 		} else {
   2273 			/*
   2274 			 * Routers take precedence over hosts. If this
   2275 			 * is a router list and we are trying to add a
   2276 			 * host, just return. If this is a host list
   2277 			 * and if we have sufficient targets, just return
   2278 			 */
   2279 			if (pii->pii_targets_are_routers ||
   2280 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
   2281 				return;
   2282 		}
   2283 	}
   2284 
   2285 	tg = calloc(1, sizeof (struct target));
   2286 	if (tg == NULL) {
   2287 		logperror("target_create: calloc");
   2288 		return;
   2289 	}
   2290 
   2291 	tg->tg_phyint_inst = pii;
   2292 	tg->tg_address = addr;
   2293 	tg->tg_in_use = 1;
   2294 	tg->tg_rtt_sa = -1;
   2295 	tg->tg_num_deferred = 0;
   2296 
   2297 	/*
   2298 	 * If this is the first target, set 'pii_targets_are_routers'
   2299 	 * The list of targets is either a list of hosts or list or
   2300 	 * routers, but not a mix.
   2301 	 */
   2302 	if (pii->pii_targets == NULL) {
   2303 		assert(pii->pii_ntargets == 0);
   2304 		assert(pii->pii_target_next == NULL);
   2305 		assert(pii->pii_rtt_target_next == NULL);
   2306 		pii->pii_targets_are_routers = is_router ? 1 : 0;
   2307 	}
   2308 
   2309 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
   2310 		assert(pii->pii_targets_are_routers);
   2311 		assert(pii->pii_target_next != NULL);
   2312 		assert(pii->pii_rtt_target_next != NULL);
   2313 		tg->tg_status = TG_UNUSED;
   2314 	} else {
   2315 		if (pii->pii_ntargets == 0) {
   2316 			assert(pii->pii_target_next == NULL);
   2317 			pii->pii_target_next = tg;
   2318 			pii->pii_rtt_target_next = tg;
   2319 		}
   2320 		pii->pii_ntargets++;
   2321 		tg->tg_status = TG_ACTIVE;
   2322 	}
   2323 
   2324 	target_insert(pii, tg);
   2325 
   2326 	/*
   2327 	 * Change state to PI_RUNNING if this phyint instance is capable of
   2328 	 * sending and receiving probes -- that is, if we know of at least 1
   2329 	 * target, and this phyint instance is probe-capable.  For more
   2330 	 * details, see the phyint state diagram in mpd_probe.c.
   2331 	 */
   2332 	pi = pii->pii_phyint;
   2333 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
   2334 		if (pi->pi_flags & IFF_FAILED)
   2335 			phyint_chstate(pi, PI_FAILED);
   2336 		else
   2337 			phyint_chstate(pi, PI_RUNNING);
   2338 	}
   2339 }
   2340 
   2341 /*
   2342  * Add the target address named by `addr' to phyint instance `pii' if it does
   2343  * not already exist.  If the target is a router, `is_router' should be set to
   2344  * B_TRUE.
   2345  */
   2346 void
   2347 target_add(struct phyint_instance *pii, struct in6_addr addr,
   2348     boolean_t is_router)
   2349 {
   2350 	struct target *tg;
   2351 
   2352 	if (pii == NULL)
   2353 		return;
   2354 
   2355 	tg = target_lookup(pii, addr);
   2356 
   2357 	/*
   2358 	 * If the target does not exist, create it; target_create() will set
   2359 	 * tg_in_use to true.  Even if it exists already, if it's a router
   2360 	 * target and we'd previously learned of it through multicast, then we
   2361 	 * need to recreate it as a router target.  Otherwise, just set
   2362 	 * tg_in_use to to true so that init_router_targets() won't delete it.
   2363 	 */
   2364 	if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
   2365 		target_create(pii, addr, is_router);
   2366 	else if (is_router)
   2367 		tg->tg_in_use = 1;
   2368 }
   2369 
   2370 /*
   2371  * Insert target at head of linked list of targets for the associated
   2372  * phyint instance
   2373  */
   2374 static void
   2375 target_insert(struct phyint_instance *pii, struct target *tg)
   2376 {
   2377 	tg->tg_next = pii->pii_targets;
   2378 	tg->tg_prev = NULL;
   2379 	if (tg->tg_next != NULL)
   2380 		tg->tg_next->tg_prev = tg;
   2381 	pii->pii_targets = tg;
   2382 }
   2383 
   2384 /*
   2385  * Delete a target (unlink and free).
   2386  */
   2387 void
   2388 target_delete(struct target *tg)
   2389 {
   2390 	int af;
   2391 	struct phyint_instance	*pii;
   2392 	struct phyint_instance	*pii_other;
   2393 
   2394 	pii = tg->tg_phyint_inst;
   2395 	af = pii->pii_af;
   2396 
   2397 	if (debug & D_TARGET) {
   2398 		char abuf[INET6_ADDRSTRLEN];
   2399 
   2400 		logdebug("target_delete(%s %s, %s)\n",
   2401 		    AF_STR(af), pii->pii_name,
   2402 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
   2403 	}
   2404 
   2405 	/*
   2406 	 * Target must be in the list of targets for this phyint
   2407 	 * instance.
   2408 	 */
   2409 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
   2410 
   2411 	/*
   2412 	 * Reset all references to 'tg' in the probe information
   2413 	 * for this phyint.
   2414 	 */
   2415 	reset_pii_probes(pii, tg);
   2416 
   2417 	/*
   2418 	 * Remove this target from the list of targets of this
   2419 	 * phyint instance.
   2420 	 */
   2421 	if (tg->tg_prev == NULL) {
   2422 		pii->pii_targets = tg->tg_next;
   2423 	} else {
   2424 		tg->tg_prev->tg_next = tg->tg_next;
   2425 	}
   2426 
   2427 	if (tg->tg_next != NULL)
   2428 		tg->tg_next->tg_prev = tg->tg_prev;
   2429 
   2430 	tg->tg_next = NULL;
   2431 	tg->tg_prev = NULL;
   2432 
   2433 	if (tg->tg_status == TG_ACTIVE)
   2434 		pii->pii_ntargets--;
   2435 
   2436 	/*
   2437 	 * Adjust the next target to probe, if it points to
   2438 	 * to the currently deleted target.
   2439 	 */
   2440 	if (pii->pii_target_next == tg)
   2441 		pii->pii_target_next = target_first(pii);
   2442 
   2443 	if (pii->pii_rtt_target_next == tg)
   2444 		pii->pii_rtt_target_next = target_first(pii);
   2445 
   2446 	free(tg);
   2447 
   2448 	/*
   2449 	 * The number of active targets pii_ntargets == 0 iff
   2450 	 * the next active target pii->pii_target_next == NULL
   2451 	 */
   2452 	if (pii->pii_ntargets != 0) {
   2453 		assert(pii->pii_target_next != NULL);
   2454 		assert(pii->pii_rtt_target_next != NULL);
   2455 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
   2456 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
   2457 		return;
   2458 	}
   2459 
   2460 	/* At this point, we don't have any active targets. */
   2461 	assert(pii->pii_target_next == NULL);
   2462 	assert(pii->pii_rtt_target_next == NULL);
   2463 
   2464 	if (pii->pii_targets_are_routers) {
   2465 		/*
   2466 		 * Activate any TG_SLOW or TG_DEAD router targets,
   2467 		 * since we don't have any other targets
   2468 		 */
   2469 		target_activate_all(pii);
   2470 
   2471 		if (pii->pii_ntargets != 0) {
   2472 			assert(pii->pii_target_next != NULL);
   2473 			assert(pii->pii_rtt_target_next != NULL);
   2474 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
   2475 			assert(pii->pii_rtt_target_next->tg_status ==
   2476 			    TG_ACTIVE);
   2477 			return;
   2478 		}
   2479 	}
   2480 
   2481 	/*
   2482 	 * If we still don't have any active targets, the list must
   2483 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
   2484 	 * targets. Zero out the probe stats since it will not be
   2485 	 * relevant any longer.
   2486 	 */
   2487 	assert(pii->pii_targets == NULL);
   2488 	pii->pii_targets_are_routers = _B_FALSE;
   2489 	clear_pii_probe_stats(pii);
   2490 	pii_other = phyint_inst_other(pii);
   2491 
   2492 	/*
   2493 	 * If there are no targets on both instances and the interface would
   2494 	 * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
   2495 	 * since we cannot probe this phyint any more.  For more details,
   2496 	 * please see phyint state diagram in mpd_probe.c.
   2497 	 */
   2498 	if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
   2499 	    pii->pii_phyint->pi_state != PI_OFFLINE)
   2500 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
   2501 }
   2502 
   2503 /*
   2504  * Flush the target list of every phyint in the group, if the list
   2505  * is a host target list. This is called if group failure is suspected.
   2506  * If all targets have failed, multicast will subsequently discover new
   2507  * targets. Else it is a group failure.
   2508  * Note: This function is a no-op if the list is a router target list.
   2509  */
   2510 static void
   2511 target_flush_hosts(struct phyint_group *pg)
   2512 {
   2513 	struct phyint *pi;
   2514 	struct phyint_instance *pii;
   2515 
   2516 	if (debug & D_TARGET)
   2517 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
   2518 
   2519 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
   2520 		pii = pi->pi_v4;
   2521 		if (pii != NULL && !pii->pii_targets_are_routers) {
   2522 			/*
   2523 			 * Delete all the targets. When the list becomes
   2524 			 * empty, target_delete() will set pii->pii_targets
   2525 			 * to NULL.
   2526 			 */
   2527 			while (pii->pii_targets != NULL)
   2528 				target_delete(pii->pii_targets);
   2529 		}
   2530 		pii = pi->pi_v6;
   2531 		if (pii != NULL && !pii->pii_targets_are_routers) {
   2532 			/*
   2533 			 * Delete all the targets. When the list becomes
   2534 			 * empty, target_delete() will set pii->pii_targets
   2535 			 * to NULL.
   2536 			 */
   2537 			while (pii->pii_targets != NULL)
   2538 				target_delete(pii->pii_targets);
   2539 		}
   2540 	}
   2541 }
   2542 
   2543 /*
   2544  * Reset all references to 'target' in the probe info, as this target is
   2545  * being deleted. The pr_target field is guaranteed to be non-null if
   2546  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
   2547  * pr_target will not be accessed unconditionally.
   2548  */
   2549 static void
   2550 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
   2551 {
   2552 	int i;
   2553 
   2554 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
   2555 		if (pii->pii_probes[i].pr_target == tg) {
   2556 			if (pii->pii_probes[i].pr_status == PR_UNACKED) {
   2557 				probe_chstate(&pii->pii_probes[i], pii,
   2558 				    PR_LOST);
   2559 			}
   2560 			pii->pii_probes[i].pr_target = NULL;
   2561 		}
   2562 	}
   2563 }
   2564 
   2565 /*
   2566  * Clear the probe statistics array.
   2567  */
   2568 void
   2569 clear_pii_probe_stats(struct phyint_instance *pii)
   2570 {
   2571 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
   2572 	/* Reset the next probe index in the probe stats array */
   2573 	pii->pii_probe_next = 0;
   2574 }
   2575 
   2576 static void
   2577 target_print(struct target *tg)
   2578 {
   2579 	char	abuf[INET6_ADDRSTRLEN];
   2580 	char	buf[128];
   2581 	char	buf2[128];
   2582 	int	af;
   2583 	int	i;
   2584 
   2585 	af = tg->tg_phyint_inst->pii_af;
   2586 
   2587 	logdebug("Target on %s %s addr %s\n"
   2588 	    "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
   2589 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
   2590 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
   2591 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
   2592 	    tg->tg_crtt, tg->tg_in_use);
   2593 
   2594 	buf[0] = '\0';
   2595 	for (i = 0; i < tg->tg_num_deferred; i++) {
   2596 		(void) snprintf(buf2, sizeof (buf2), " %dms",
   2597 		    tg->tg_deferred[i]);
   2598 		(void) strlcat(buf, buf2, sizeof (buf));
   2599 	}
   2600 	logdebug("deferred rtts:%s\n", buf);
   2601 }
   2602 
   2603 void
   2604 phyint_inst_print_all(void)
   2605 {
   2606 	struct phyint_instance *pii;
   2607 
   2608 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
   2609 		phyint_inst_print(pii);
   2610 	}
   2611 }
   2612 
   2613 /*
   2614  * Compare two prefixes that have the same prefix length.
   2615  * Fails if the prefix length is unreasonable.
   2616  */
   2617 boolean_t
   2618 prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
   2619 {
   2620 	uchar_t mask;
   2621 	int j;
   2622 
   2623 	if (prefix_len > IPV6_ABITS)
   2624 		return (_B_FALSE);
   2625 
   2626 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
   2627 		if (p1.s6_addr[j] != p2.s6_addr[j])
   2628 			return (_B_FALSE);
   2629 
   2630 	/* Make the N leftmost bits one */
   2631 	mask = 0xff << (8 - prefix_len);
   2632 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
   2633 		return (_B_FALSE);
   2634 
   2635 	return (_B_TRUE);
   2636 }
   2637 
   2638 /*
   2639  * Get the number of UP logints on phyint `pi'.
   2640  */
   2641 static int
   2642 logint_upcount(struct phyint *pi)
   2643 {
   2644 	struct	logint	*li;
   2645 	int count = 0;
   2646 
   2647 	if (pi->pi_v4 != NULL) {
   2648 		for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
   2649 			if (li->li_flags & IFF_UP)
   2650 				count++;
   2651 		}
   2652 	}
   2653 
   2654 	if (pi->pi_v6 != NULL) {
   2655 		for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
   2656 			if (li->li_flags & IFF_UP)
   2657 				count++;
   2658 		}
   2659 	}
   2660 
   2661 	return (count);
   2662 }
   2663 
   2664 /*
   2665  * Get the phyint instance with the other (IPv4 / IPv6) protocol
   2666  */
   2667 struct phyint_instance *
   2668 phyint_inst_other(struct phyint_instance *pii)
   2669 {
   2670 	if (pii->pii_af == AF_INET)
   2671 		return (pii->pii_phyint->pi_v6);
   2672 	else
   2673 		return (pii->pii_phyint->pi_v4);
   2674 }
   2675 
   2676 /*
   2677  * Check whether a phyint is functioning.
   2678  */
   2679 boolean_t
   2680 phyint_is_functioning(struct phyint *pi)
   2681 {
   2682 	if (pi->pi_state == PI_RUNNING)
   2683 		return (_B_TRUE);
   2684 	return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED));
   2685 }
   2686 
   2687 /*
   2688  * Check whether a phyint is usable.
   2689  */
   2690 boolean_t
   2691 phyint_is_usable(struct phyint *pi)
   2692 {
   2693 	if (logint_upcount(pi) == 0)
   2694 		return (_B_FALSE);
   2695 	return (phyint_is_functioning(pi));
   2696 }
   2697 
   2698 /*
   2699  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
   2700  * Before sending the event, it prepends the current version of the IPMP
   2701  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
   2702  * `nvl' is freed).
   2703  */
   2704 static int
   2705 post_event(const char *subclass, nvlist_t *nvl)
   2706 {
   2707 	static evchan_t *evchp = NULL;
   2708 
   2709 	/*
   2710 	 * Initialize the event channel if we haven't already done so.
   2711 	 */
   2712 	if (evchp == NULL) {
   2713 		errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
   2714 		if (errno != 0) {
   2715 			logerr("cannot create event channel `%s': %s\n",
   2716 			    IPMP_EVENT_CHAN, strerror(errno));
   2717 			goto failed;
   2718 		}
   2719 	}
   2720 
   2721 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
   2722 	    IPMP_EVENT_CUR_VERSION);
   2723 	if (errno != 0) {
   2724 		logerr("cannot create `%s' event: %s", subclass,
   2725 		    strerror(errno));
   2726 		goto failed;
   2727 	}
   2728 
   2729 	errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
   2730 	    "in.mpathd", nvl, EVCH_NOSLEEP);
   2731 	if (errno != 0) {
   2732 		logerr("cannot send `%s' event: %s\n", subclass,
   2733 		    strerror(errno));
   2734 		goto failed;
   2735 	}
   2736 
   2737 	nvlist_free(nvl);
   2738 	return (0);
   2739 failed:
   2740 	nvlist_free(nvl);
   2741 	return (-1);
   2742 }
   2743 
   2744 /*
   2745  * Return the external IPMP state associated with phyint `pi'.
   2746  */
   2747 static ipmp_if_state_t
   2748 ifstate(struct phyint *pi)
   2749 {
   2750 	switch (pi->pi_state) {
   2751 	case PI_INIT:
   2752 		return (IPMP_IF_UNKNOWN);
   2753 
   2754 	case PI_NOTARGETS:
   2755 		if (pi->pi_flags & IFF_FAILED)
   2756 			return (IPMP_IF_FAILED);
   2757 		return (IPMP_IF_UNKNOWN);
   2758 
   2759 	case PI_OFFLINE:
   2760 		return (IPMP_IF_OFFLINE);
   2761 
   2762 	case PI_FAILED:
   2763 		return (IPMP_IF_FAILED);
   2764 
   2765 	case PI_RUNNING:
   2766 		return (IPMP_IF_OK);
   2767 	}
   2768 
   2769 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
   2770 	abort();
   2771 	/* NOTREACHED */
   2772 }
   2773 
   2774 /*
   2775  * Return the external IPMP interface type associated with phyint `pi'.
   2776  */
   2777 static ipmp_if_type_t
   2778 iftype(struct phyint *pi)
   2779 {
   2780 	if (pi->pi_flags & IFF_STANDBY)
   2781 		return (IPMP_IF_STANDBY);
   2782 	else
   2783 		return (IPMP_IF_NORMAL);
   2784 }
   2785 
   2786 /*
   2787  * Return the external IPMP link state associated with phyint `pi'.
   2788  */
   2789 static ipmp_if_linkstate_t
   2790 iflinkstate(struct phyint *pi)
   2791 {
   2792 	if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)))
   2793 		return (IPMP_LINK_UNKNOWN);
   2794 
   2795 	return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP);
   2796 }
   2797 
   2798 /*
   2799  * Return the external IPMP probe state associated with phyint `pi'.
   2800  */
   2801 static ipmp_if_probestate_t
   2802 ifprobestate(struct phyint *pi)
   2803 {
   2804 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6))
   2805 		return (IPMP_PROBE_DISABLED);
   2806 
   2807 	if (pi->pi_state == PI_FAILED)
   2808 		return (IPMP_PROBE_FAILED);
   2809 
   2810 	if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6))
   2811 		return (IPMP_PROBE_UNKNOWN);
   2812 
   2813 	return (IPMP_PROBE_OK);
   2814 }
   2815 
   2816 /*
   2817  * Return the external IPMP target mode associated with phyint instance `pii'.
   2818  */
   2819 static ipmp_if_targmode_t
   2820 iftargmode(struct phyint_instance *pii)
   2821 {
   2822 	if (!PROBE_ENABLED(pii))
   2823 		return (IPMP_TARG_DISABLED);
   2824 	else if (pii->pii_targets_are_routers)
   2825 		return (IPMP_TARG_ROUTES);
   2826 	else
   2827 		return (IPMP_TARG_MULTICAST);
   2828 }
   2829 
   2830 /*
   2831  * Return the external IPMP flags associated with phyint `pi'.
   2832  */
   2833 static ipmp_if_flags_t
   2834 ifflags(struct phyint *pi)
   2835 {
   2836 	ipmp_if_flags_t flags = 0;
   2837 
   2838 	if (logint_upcount(pi) == 0)
   2839 		flags |= IPMP_IFFLAG_DOWN;
   2840 	if (pi->pi_flags & IFF_INACTIVE)
   2841 		flags |= IPMP_IFFLAG_INACTIVE;
   2842 	if (pi->pi_hwaddrdup)
   2843 		flags |= IPMP_IFFLAG_HWADDRDUP;
   2844 	if (phyint_is_functioning(pi) && flags == 0)
   2845 		flags |= IPMP_IFFLAG_ACTIVE;
   2846 
   2847 	return (flags);
   2848 }
   2849 
   2850 /*
   2851  * Store the test address used on phyint instance `pii' in `ssp'.  If there's
   2852  * no test address, 0.0.0.0 is stored.
   2853  */
   2854 static struct sockaddr_storage *
   2855 iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
   2856 {
   2857 	if (PROBE_ENABLED(pii))
   2858 		addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
   2859 	else
   2860 		addr2storage(AF_INET6, &in6addr_any, ssp);
   2861 
   2862 	return (ssp);
   2863 }
   2864 
   2865 /*
   2866  * Return the external IPMP group state associated with phyint group `pg'.
   2867  */
   2868 static ipmp_group_state_t
   2869 groupstate(struct phyint_group *pg)
   2870 {
   2871 	switch (pg->pg_state) {
   2872 	case PG_FAILED:
   2873 		return (IPMP_GROUP_FAILED);
   2874 	case PG_DEGRADED:
   2875 		return (IPMP_GROUP_DEGRADED);
   2876 	case PG_OK:
   2877 		return (IPMP_GROUP_OK);
   2878 	}
   2879 
   2880 	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
   2881 	abort();
   2882 	/* NOTREACHED */
   2883 }
   2884 
   2885 /*
   2886  * Return the external IPMP probe state associated with probe `ps'.
   2887  */
   2888 static ipmp_probe_state_t
   2889 probestate(struct probe_stats *ps)
   2890 {
   2891 	switch (ps->pr_status) {
   2892 	case PR_UNUSED:
   2893 	case PR_LOST:
   2894 		return (IPMP_PROBE_LOST);
   2895 	case PR_UNACKED:
   2896 		return (IPMP_PROBE_SENT);
   2897 	case PR_ACKED:
   2898 		return (IPMP_PROBE_ACKED);
   2899 	}
   2900 
   2901 	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
   2902 	abort();
   2903 	/* NOTREACHED */
   2904 }
   2905 
   2906 /*
   2907  * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
   2908  * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
   2909  */
   2910 int
   2911 probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
   2912 {
   2913 	nvlist_t *nvl;
   2914 	hrtime_t proc_time = 0, recv_time = 0;
   2915 	struct sockaddr_storage ss;
   2916 	struct target *tg = pr->pr_target;
   2917 	int64_t rttavg, rttdev;
   2918 
   2919 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
   2920 	if (errno != 0) {
   2921 		logperror("cannot create `interface change' event");
   2922 		return (-1);
   2923 	}
   2924 
   2925 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
   2926 	if (errno != 0)
   2927 		goto failed;
   2928 
   2929 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
   2930 	if (errno != 0)
   2931 		goto failed;
   2932 
   2933 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
   2934 	if (errno != 0)
   2935 		goto failed;
   2936 
   2937 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
   2938 	    pr->pr_hrtime_start);
   2939 	if (errno != 0)
   2940 		goto failed;
   2941 
   2942 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
   2943 	    pr->pr_hrtime_sent);
   2944 	if (errno != 0)
   2945 		goto failed;
   2946 
   2947 	if (pr->pr_status == PR_ACKED) {
   2948 		recv_time = pr->pr_hrtime_ackrecv;
   2949 		proc_time = pr->pr_hrtime_ackproc;
   2950 	}
   2951 
   2952 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
   2953 	if (errno != 0)
   2954 		goto failed;
   2955 
   2956 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
   2957 	if (errno != 0)
   2958 		goto failed;
   2959 
   2960 	if (tg != NULL)
   2961 		addr2storage(pii->pii_af, &tg->tg_address, &ss);
   2962 	else
   2963 		addr2storage(pii->pii_af, &in6addr_any, &ss);
   2964 
   2965 	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
   2966 	    sizeof (ss));
   2967 	if (errno != 0)
   2968 		goto failed;
   2969 
   2970 	rttavg = (tg != NULL) ? (tg->tg_rtt_sa / 8) : 0;
   2971 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, rttavg);
   2972 	if (errno != 0)
   2973 		goto failed;
   2974 
   2975 	rttdev = (tg != NULL) ? (tg->tg_rtt_sd / 4) : 0;
   2976 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, rttdev);
   2977 	if (errno != 0)
   2978 		goto failed;
   2979 
   2980 	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
   2981 failed:
   2982 	logperror("cannot create `probe state' event");
   2983 	nvlist_free(nvl);
   2984 	return (-1);
   2985 }
   2986 
   2987 /*
   2988  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
   2989  * Returns 0 on success, -1 on failure.
   2990  */
   2991 static int
   2992 phyint_group_state_event(struct phyint_group *pg)
   2993 {
   2994 	nvlist_t	*nvl;
   2995 
   2996 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
   2997 	if (errno != 0) {
   2998 		logperror("cannot create `group state change' event");
   2999 		return (-1);
   3000 	}
   3001 
   3002 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
   3003 	if (errno != 0)
   3004 		goto failed;
   3005 
   3006 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
   3007 	if (errno != 0)
   3008 		goto failed;
   3009 
   3010 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
   3011 	if (errno != 0)
   3012 		goto failed;
   3013 
   3014 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
   3015 failed:
   3016 	logperror("cannot create `group state change' event");
   3017 	nvlist_free(nvl);
   3018 	return (-1);
   3019 }
   3020 
   3021 /*
   3022  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
   3023  * `pg'.  Returns 0 on success, -1 on failure.
   3024  */
   3025 static int
   3026 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
   3027 {
   3028 	nvlist_t *nvl;
   3029 
   3030 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
   3031 	if (errno != 0) {
   3032 		logperror("cannot create `group change' event");
   3033 		return (-1);
   3034 	}
   3035 
   3036 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
   3037 	if (errno != 0)
   3038 		goto failed;
   3039 
   3040 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
   3041 	if (errno != 0)
   3042 		goto failed;
   3043 
   3044 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
   3045 	    phyint_grouplistsig);
   3046 	if (errno != 0)
   3047 		goto failed;
   3048 
   3049 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
   3050 	if (errno != 0)
   3051 		goto failed;
   3052 
   3053 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
   3054 failed:
   3055 	logperror("cannot create `group change' event");
   3056 	nvlist_free(nvl);
   3057 	return (-1);
   3058 }
   3059 
   3060 /*
   3061  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
   3062  * group `pg'.	Returns 0 on success, -1 on failure.
   3063  */
   3064 static int
   3065 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
   3066     ipmp_if_op_t op)
   3067 {
   3068 	nvlist_t *nvl;
   3069 
   3070 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
   3071 	if (errno != 0) {
   3072 		logperror("cannot create `group member change' event");
   3073 		return (-1);
   3074 	}
   3075 
   3076 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
   3077 	if (errno != 0)
   3078 		goto failed;
   3079 
   3080 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
   3081 	if (errno != 0)
   3082 		goto failed;
   3083 
   3084 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
   3085 	if (errno != 0)
   3086 		goto failed;
   3087 
   3088 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
   3089 	if (errno != 0)
   3090 		goto failed;
   3091 
   3092 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
   3093 	if (errno != 0)
   3094 		goto failed;
   3095 
   3096 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
   3097 	if (errno != 0)
   3098 		goto failed;
   3099 
   3100 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
   3101 failed:
   3102 	logperror("cannot create `group member change' event");
   3103 	nvlist_free(nvl);
   3104 	return (-1);
   3105 
   3106 }
   3107 
   3108 /*
   3109  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
   3110  * Returns 0 on success, -1 on failure.
   3111  */
   3112 static int
   3113 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
   3114 {
   3115 	nvlist_t *nvl;
   3116 
   3117 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
   3118 	if (errno != 0) {
   3119 		logperror("cannot create `interface change' event");
   3120 		return (-1);
   3121 	}
   3122 
   3123 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
   3124 	if (errno != 0)
   3125 		goto failed;
   3126 
   3127 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
   3128 	if (errno != 0)
   3129 		goto failed;
   3130 
   3131 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
   3132 	if (errno != 0)
   3133 		goto failed;
   3134 
   3135 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
   3136 	if (errno != 0)
   3137 		goto failed;
   3138 
   3139 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
   3140 	if (errno != 0)
   3141 		goto failed;
   3142 
   3143 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
   3144 failed:
   3145 	logperror("cannot create `interface change' event");
   3146 	nvlist_free(nvl);
   3147 	return (-1);
   3148 
   3149 }
   3150 
   3151 /*
   3152  * Generate a signature for use.  The signature is conceptually divided
   3153  * into two pieces: a random 16-bit "generation number" and a 48-bit
   3154  * monotonically increasing integer.  The generation number protects
   3155  * against stale updates to entities (e.g., IPMP groups) that have been
   3156  * deleted and since recreated.
   3157  */
   3158 static uint64_t
   3159 gensig(void)
   3160 {
   3161 	static int seeded = 0;
   3162 
   3163 	if (seeded == 0) {
   3164 		srand48((long)gethrtime());
   3165 		seeded++;
   3166 	}
   3167 
   3168 	return ((uint64_t)lrand48() << 48 | 1);
   3169 }
   3170 
   3171 /*
   3172  * Store the information associated with group `grname' into a dynamically
   3173  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
   3174  */
   3175 unsigned int
   3176 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
   3177 {
   3178 	struct phyint		*pi;
   3179 	struct phyint_group	*pg;
   3180 	char			(*ifs)[LIFNAMSIZ];
   3181 	unsigned int		i, j;
   3182 	unsigned int		nif = 0, naddr = 0;
   3183 	lifgroupinfo_t		lifgr;
   3184 	addrlist_t		*addrp;
   3185 	struct sockaddr_storage	*addrs;
   3186 	int			fdt = 0;
   3187 
   3188 	pg = phyint_group_lookup(grname);
   3189 	if (pg == NULL)
   3190 		return (IPMP_EUNKGROUP);
   3191 
   3192 	/*
   3193 	 * Tally up the number of interfaces, allocate an array to hold them,
   3194 	 * and insert their names into the array.  While we're at it, if any
   3195 	 * interface is actually enabled to send probes, save the group fdt.
   3196 	 */
   3197 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
   3198 		nif++;
   3199 
   3200 	ifs = alloca(nif * sizeof (*ifs));
   3201 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
   3202 		assert(i < nif);
   3203 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
   3204 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
   3205 			fdt = pg->pg_fdt;
   3206 	}
   3207 	assert(i == nif);
   3208 
   3209 	/*
   3210 	 * If this is the anonymous group, there's no other information to
   3211 	 * collect (since there's no IPMP interface).
   3212 	 */
   3213 	if (pg == phyint_anongroup) {
   3214 		*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
   3215 		    groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
   3216 		return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
   3217 	}
   3218 
   3219 	/*
   3220 	 * Grab some additional information about the group from the kernel.
   3221 	 * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
   3222 	 * we can use ifsock_v4 even for a V6-only group.)
   3223 	 */
   3224 	(void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
   3225 	if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
   3226 		if (errno == ENOENT)
   3227 			return (IPMP_EUNKGROUP);
   3228 
   3229 		logperror("getgroupinfo: SIOCGLIFGROUPINFO");
   3230 		return (IPMP_FAILURE);
   3231 	}
   3232 
   3233 	/*
   3234 	 * Tally up the number of data addresses, allocate an array to hold
   3235 	 * them, and insert their values into the array.
   3236 	 */
   3237 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
   3238 		naddr++;
   3239 
   3240 	addrs = alloca(naddr * sizeof (*addrs));
   3241 	i = 0;
   3242 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
   3243 		/*
   3244 		 * It's possible to have duplicate addresses (if some are
   3245 		 * down).  Weed the dups out to avoid confusing consumers.
   3246 		 * (If groups start having tons of addresses, we'll need a
   3247 		 * better algorithm here.)
   3248 		 */
   3249 		for (j = 0; j < i; j++) {
   3250 			if (sockaddrcmp(&addrs[j], &addrp->al_addr))
   3251 				break;
   3252 		}
   3253 		if (j == i) {
   3254 			assert(i < naddr);
   3255 			addrs[i++] = addrp->al_addr;
   3256 		}
   3257 	}
   3258 	naddr = i;
   3259 
   3260 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
   3261 	    groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
   3262 	    lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
   3263 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
   3264 }
   3265 
   3266 /*
   3267  * Store the target information associated with phyint instance `pii' into a
   3268  * dynamically allocated structure pointed to by `*targinfopp'.  Returns an
   3269  * IPMP error code.
   3270  */
   3271 unsigned int
   3272 gettarginfo(struct phyint_instance *pii, const char *name,
   3273     ipmp_targinfo_t **targinfopp)
   3274 {
   3275 	uint_t ntarg = 0;
   3276 	struct target *tg;
   3277 	struct sockaddr_storage	ss;
   3278 	struct sockaddr_storage *targs = NULL;
   3279 
   3280 	if (PROBE_CAPABLE(pii)) {
   3281 		targs = alloca(pii->pii_ntargets * sizeof (*targs));
   3282 		tg = pii->pii_target_next;
   3283 		do {
   3284 			if (tg->tg_status == TG_ACTIVE) {
   3285 				assert(ntarg < pii->pii_ntargets);
   3286 				addr2storage(pii->pii_af, &tg->tg_address,
   3287 				    &targs[ntarg++]);
   3288 			}
   3289 			if ((tg = tg->tg_next) == NULL)
   3290 				tg = pii->pii_targets;
   3291 		} while (tg != pii->pii_target_next);
   3292 
   3293 		assert(ntarg == pii->pii_ntargets);
   3294 	}
   3295 
   3296 	*targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
   3297 	    iftargmode(pii), ntarg, targs);
   3298 	return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
   3299 }
   3300 
   3301 /*
   3302  * Store the information associated with interface `ifname' into a dynamically
   3303  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
   3304  */
   3305 unsigned int
   3306 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
   3307 {
   3308 	int		retval;
   3309 	struct phyint	*pi;
   3310 	ipmp_targinfo_t	*targinfo4;
   3311 	ipmp_targinfo_t	*targinfo6;
   3312 
   3313 	pi = phyint_lookup(ifname);
   3314 	if (pi == NULL)
   3315 		return (IPMP_EUNKIF);
   3316 
   3317 	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
   3318 	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
   3319 		goto out;
   3320 
   3321 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
   3322 	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
   3323 	    ifflags(pi), targinfo4, targinfo6);
   3324 	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
   3325 out:
   3326 	if (targinfo4 != NULL)
   3327 		ipmp_freetarginfo(targinfo4);
   3328 	if (targinfo6 != NULL)
   3329 		ipmp_freetarginfo(targinfo6);
   3330 	return (retval);
   3331 }
   3332 
   3333 /*
   3334  * Store the current list of IPMP groups into a dynamically allocated
   3335  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
   3336  */
   3337 unsigned int
   3338 getgrouplist(ipmp_grouplist_t **grlistpp)
   3339 {
   3340 	struct phyint_group	*pg;
   3341 	char			(*groups)[LIFGRNAMSIZ];
   3342 	unsigned int		i, ngroup;
   3343 
   3344 	/*
   3345 	 * Tally up the number of groups, allocate an array to hold them, and
   3346 	 * insert their names into the array.
   3347 	 */
   3348 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
   3349 		ngroup++;
   3350 
   3351 	groups = alloca(ngroup * sizeof (*groups));
   3352 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
   3353 		assert(i < ngroup);
   3354 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
   3355 	}
   3356 	assert(i == ngroup);
   3357 
   3358 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
   3359 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
   3360 }
   3361 
   3362 /*
   3363  * Store the address information for `ssp' (in group `grname') into a
   3364  * dynamically allocated structure pointed to by `*adinfopp'.  Returns an IPMP
   3365  * error code.  (We'd call this function getaddrinfo(), but it would conflict
   3366  * with getaddrinfo(3SOCKET)).
   3367  */
   3368 unsigned int
   3369 getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
   3370     ipmp_addrinfo_t **adinfopp)
   3371 {
   3372 	int ifsock;
   3373 	addrlist_t *addrp, *addrmatchp = NULL;
   3374 	ipmp_addr_state_t state;
   3375 	const char *binding = "";
   3376 	struct lifreq lifr;
   3377 	struct phyint_group *pg;
   3378 
   3379 	if ((pg = phyint_group_lookup(grname)) == NULL)
   3380 		return (IPMP_EUNKADDR);
   3381 
   3382 	/*
   3383 	 * Walk through the data addresses, and find a match.  Note that since
   3384 	 * some of the addresses may be down, more than one may match.  We
   3385 	 * prefer an up address (if one exists).
   3386 	 */
   3387 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
   3388 		if (sockaddrcmp(ssp, &addrp->al_addr)) {
   3389 			addrmatchp = addrp;
   3390 			if (addrmatchp->al_flags & IFF_UP)
   3391 				break;
   3392 		}
   3393 	}
   3394 
   3395 	if (addrmatchp == NULL)
   3396 		return (IPMP_EUNKADDR);
   3397 
   3398 	state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN;
   3399 	if (state == IPMP_ADDR_UP) {
   3400 		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
   3401 		(void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ);
   3402 		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
   3403 			binding = lifr.lifr_binding;
   3404 	}
   3405 
   3406 	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
   3407 	return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
   3408 }
   3409 
/*
 * Store a snapshot of the IPMP subsystem into a dynamically allocated
 * structure pointed to by `*snapp'.  Returns an IPMP error code.
 *
 * The snapshot is assembled in three steps: the group list, then the
 * information for each listed group along with the information for each of
 * that group's data addresses, and finally the information for every phyint
 * on the global `phyints' list.  If any step fails, the partially built
 * snapshot is freed, the failing step's error code is returned, and `*snapp'
 * is left untouched.
 */
unsigned int
getsnap(ipmp_snap_t **snapp)
{
	ipmp_grouplist_t	*grlistp;
	ipmp_groupinfo_t	*grinfop;
	ipmp_addrinfo_t		*adinfop;
	ipmp_addrlist_t		*adlistp;
	ipmp_ifinfo_t		*ifinfop;
	ipmp_snap_t		*snap;
	struct phyint		*pi;
	unsigned int		i, j;
	int			retval;

	snap = ipmp_snap_create();
	if (snap == NULL)
		return (IPMP_ENOMEM);

	/*
	 * Add group list.
	 */
	retval = getgrouplist(&snap->sn_grlistp);
	if (retval != IPMP_SUCCESS)
		goto failed;

	/*
	 * Add information for each group in the list, along with all of its
	 * data addresses.
	 */
	grlistp = snap->sn_grlistp;
	for (i = 0; i < grlistp->gl_ngroup; i++) {
		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
		if (retval != IPMP_SUCCESS)
			goto failed;

		/*
		 * The group info is freed here only if it could not be added.
		 * NOTE(review): presumably the snapshot owns it once the add
		 * succeeds (and ipmp_snap_free() releases it) -- confirm.
		 */
		retval = ipmp_snap_addgroupinfo(snap, grinfop);
		if (retval != IPMP_SUCCESS) {
			ipmp_freegroupinfo(grinfop);
			goto failed;
		}

		adlistp = grinfop->gr_adlistp;
		for (j = 0; j < adlistp->al_naddr; j++) {
			retval = getgraddrinfo(grinfop->gr_name,
			    &adlistp->al_addrs[j], &adinfop);
			if (retval != IPMP_SUCCESS)
				goto failed;

			/* As above: freed here only if the add fails. */
			retval = ipmp_snap_addaddrinfo(snap, adinfop);
			if (retval != IPMP_SUCCESS) {
				ipmp_freeaddrinfo(adinfop);
				goto failed;
			}
		}
	}

	/*
	 * Add information for each configured phyint.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		retval = getifinfo(pi->pi_name, &ifinfop);
		if (retval != IPMP_SUCCESS)
			goto failed;

		/* As above: freed here only if the add fails. */
		retval = ipmp_snap_addifinfo(snap, ifinfop);
		if (retval != IPMP_SUCCESS) {
			ipmp_freeifinfo(ifinfop);
			goto failed;
		}
	}

	/* Hand the completed snapshot to the caller. */
	*snapp = snap;
	return (IPMP_SUCCESS);
failed:
	ipmp_snap_free(snap);
	return (retval);
}
   3490