Home | History | Annotate | Download | only in in.mpathd
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #ifndef	_MPD_TABLES_H
     27 #define	_MPD_TABLES_H
     28 
     29 #ifdef	__cplusplus
     30 extern "C" {
     31 #endif
     32 
/*
 * Terminology:
 *
 * phyint: A NIC, e.g. hme0. This is represented as 'struct phyint'.
 *
 * phyint instance: A protocol instance of a phyint, e.g. the IPv4 instance of
 *	hme0 or the IPv6 instance of hme0. (struct phyint_instance)
 *
 * logint: A logical interface, e.g. hme0:1. (struct logint)
 *
 * phyint_group: A group of phyints, i.e. physical interfaces that are
 *	(i) connected to the same level 2 topology, e.g. the same ethernet
 *	    switch, AND
 *	(ii) share the same phyint group name.
 * Load spreading and failover occur across members of the same phyint group.
 * phyint group members must be homogeneous, i.e. if a phyint belonging to a
 * phyint group has an IPv6 protocol instance, then all members of the phyint
 * group must have IPv6 protocol instances. (struct phyint_group)
 */
     52 
     53 #define	MAXDEFERREDRTT		1	/* Maximum number of deferred rtts */
     54 
     55 /*
     56  * Status of the phyint, expressed by the return code of failure_state()
     57  */
     58 #define	PHYINT_OK	0		/* No failure detected */
     59 #define	PHYINT_FAILURE	1		/* NIC failure detected */
     60 #define	GROUP_FAILURE	2		/* All NICs have failed */
     61 
     62 /*
     63  * Return values of phyint_inst_update_from_k()
     64  */
     65 #define	PI_OK			1	/* Phyint matches in the kernel */
     66 #define	PI_DELETED		2	/* Phyint has vanished in the kernel */
     67 #define	PI_IFINDEX_CHANGED	3	/* Phyint's ifindex has changed */
     68 #define	PI_IOCTL_ERROR		4	/* Some ioctl error */
     69 #define	PI_GROUP_CHANGED	5	/* The phyint has changed group. */
     70 
     71 #define	PHYINT_FLAGS(flags)	\
     72 	(((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \
     73 	IFF_RUNNING)) | (handle_link_notifications ? 0 : IFF_RUNNING))
     74 
     75 /* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */
     76 #define	PHYINT_INSTANCE(pi, af)	\
     77 	((af) == AF_INET ? (pi)->pi_v4 : (pi)->pi_v6)
     78 
     79 /*
     80  * A phyint instance is probe *enabled* if it has been configured with a
     81  * unique probe address (i.e., an IFF_NOFAILOVER address).  It is probe
     82  * *capable* if it is also able to send probes (i.e., has one or more
     83  * targets available).
     84  */
     85 #define	PROBE_ENABLED(pii) \
     86 	(((pii) != NULL) && ((pii)->pii_probe_sock != -1) &&	\
     87 	((pii)->pii_probe_logint != NULL) &&			\
     88 	(((pii)->pii_probe_logint->li_dupaddr == 0)))
     89 
     90 #define	PROBE_CAPABLE(pii) \
     91 	(PROBE_ENABLED(pii) && ((pii)->pii_ntargets != 0))
     92 
     93 /* Subtract b from a modulo n. i.e. (a - b) mod n  */
     94 #define	MOD_SUB(a, b, n)	\
     95 	((((a) + (n)) - (b)) % (n))
     96 
     97 /* Increment modulo n */
     98 #define	MOD_INCR(a, n)		\
     99 	(((a) + 1) % (n))
    100 
    101 /* Decrement modulo n */
    102 #define	MOD_DCR(a, n)		\
    103 	MOD_SUB(a, 1, n)
    104 
    105 /*
    106  * 'index' represents an index into the circular probe stats array of
    107  * size PROBE_STATS_COUNT.  0 <= index < PROBE_STATS_COUNT. This is used
    108  * to access members of the pii_probes[] array defined in the phyint_instance
    109  * structure.
    110  */
    111 #define	PROBE_INDEX_PREV(index)	\
    112 	MOD_DCR(index, PROBE_STATS_COUNT)
    113 
    114 #define	PROBE_INDEX_NEXT(index)	\
    115 	MOD_INCR(index, PROBE_STATS_COUNT)
    116 
    117 
    118 /*
    119  * If we receive more than LINK_UP_PERMIN "link up" notifications in a minute,
    120  * then don't actually perform the repair operation until we've dropped back
    121  * below the threshold (or we have a probe address and our probes indicate
    122  * that the link is functioning again).  This is to prevent link flapping in
    123  * the case where we don't have a probe address.
    124  */
    125 #define	LINK_UP_PERMIN	2
    126 
    127 #define	LINK_DOWN(pi) ((pi)->pi_link_state == 0)
    128 #define	LINK_UP(pi) (!LINK_DOWN(pi))
    129 #define	FLAGS_TO_LINK_STATE(pi) (((pi)->pi_flags & IFF_RUNNING) != 0)
    130 #define	UPDATE_LINK_STATE(pi) ((pi)->pi_link_state = \
    131 	FLAGS_TO_LINK_STATE(pi) ? 1 : 0)
    132 #define	INIT_LINK_STATE(pi) ((pi)->pi_link_state = 1)
    133 
    134 /*
    135  * Phyint group states; see below for the phyint group definition.
    136  */
    137 enum pg_state {
    138 	PG_OK = 1,	/* all interfaces in the group are working */
    139 	PG_DEGRADED,	/* some interfaces in the group are unusable */
    140 	PG_FAILED	/* all interfaces in the group are unusable */
    141 };
    142 
    143 /*
    144  * Convenience macro to check if the whole group has failed.
    145  */
    146 #define	GROUP_FAILED(pg)	((pg)->pg_state == PG_FAILED)
    147 
    148 /*
    149  * A doubly linked list of all phyint groups in the system.
    150  * A phyint group is identified by its group name.
    151  */
    152 struct phyint_group {
    153 	char pg_name[LIFGRNAMSIZ];	/* Phyint group name */
    154 	struct phyint *pg_phyint;	/* List of phyints in this group */
    155 	struct phyint_group *pg_next;	/* Next phyint group */
    156 	struct phyint_group *pg_prev;	/* Prev phyint group */
    157 	uint64_t 	pg_sig;		/* Current signature of this group */
    158 	int		pg_probeint;	/* Interval between probes */
    159 	int		pg_fdt;		/* Time needed to detect failure */
    160 	enum pg_state	pg_state;	/* Current group state */
    161 	boolean_t	pg_in_use;	/* To detect removed groups */
    162 	struct addrlist	*pg_addrs;	/* Data addresses in this group */
    163 	boolean_t pg_failmsg_printed;	/* Group failure msg printed */
    164 };
    165 
    166 /*
    167  * Phyint states; see below for the phyint definition.
    168  */
    169 enum pi_state {
    170 	PI_INIT		= 0,	/* Phyint is being initialized */
    171 	PI_NOTARGETS	= 1,	/* Phyint has no targets */
    172 	PI_RUNNING	= 2,	/* Phyint is functioning */
    173 	PI_FAILED	= 3,	/* Phyint is failed */
    174 	PI_OFFLINE	= 4	/* Phyint is offline */
    175 };
    176 
    177 /*
    178  * Representation of a NIC or a phyint. There is a list of all known phyints.
    179  * There is also a list of phyints belonging to a phyint group, one list
    180  * per phyint group.
    181  */
    182 struct phyint {
    183 	char	pi_name[LIFNAMSIZ + 1]; /* Phyint name eg. le0 */
    184 	struct phyint_instance *pi_v4;	/* The IPv4 instance */
    185 	struct phyint_instance *pi_v6;	/* The IPv6 instance */
    186 	struct phyint_group *pi_group;	/* Pointer to the group */
    187 	struct phyint	*pi_next;	/* List of all phyints */
    188 	struct phyint	*pi_prev;	/* List of all phyints */
    189 	struct phyint	*pi_pgnext;	/* List of phyints in this group */
    190 	struct phyint	*pi_pgprev;	/* List of phyints in this group */
    191 	uint_t		pi_ifindex;	/* interface index */
    192 	enum pi_state	pi_state;	/* State of the phyint */
    193 	uint64_t	pi_flags;	/* Phyint flags from kernel */
    194 	uint16_t	pi_icmpid;	/* icmp id in icmp echo request */
    195 	uint64_t	pi_taddrthresh;	/* time (in secs) to delay logging */
    196 					/* about missing test addresses */
    197 	dlpi_handle_t	pi_dh;		/* DLPI handle to underlying link */
    198 	uint_t		pi_notes; 	/* enabled DLPI notifications */
    199 	uchar_t		pi_hwaddr[DLPI_PHYSADDR_MAX]; /* phyint's hw address */
    200 	size_t		pi_hwaddrlen;	/* phyint's hw address length */
    201 
    202 	/*
    203 	 * The pi_whenup array is a circular buffer of the most recent
    204 	 * times (in milliseconds since some arbitrary point of time in
    205 	 * the past) that the interface was brought up; pi_whendx identifies
    206 	 * the oldest element of the array.
    207 	 */
    208 	uint_t		pi_whenup[LINK_UP_PERMIN];
    209 	unsigned int	pi_whendx;
    210 
    211 	uint_t
    212 		pi_taddrmsg_printed : 1,	/* testaddr msg printed */
    213 		pi_duptaddrmsg_printed : 1,	/* dup testaddr msg printed */
    214 		pi_cfgmsg_printed : 1,	/* bad config msg printed */
    215 		pi_lfmsg_printed : 1,   /* link-flapping msg printed */
    216 		pi_link_state : 1,	/* interface link state */
    217 		pi_hwaddrdup : 1; 	/* disabled due to dup hw address */
    218 };
    219 
    220 /*
    221  * A doubly linked list of all phyint_instances each of which contains a
    222  * doubly linked list of logical interfaces and targets. For eg. if both
    223  * IPv4 and IPv6 are used over hme0, we have 2 phyint instances, 1 for each
    224  * protocol.
    225  */
    226 struct phyint_instance {
    227 	struct phyint_instance	*pii_next;	/* List of all phyint insts */
    228 	struct phyint_instance	*pii_prev;	/* List of all phyint insts */
    229 
    230 	struct phyint	*pii_phyint;	/* Back pointer to the phyint */
    231 	struct target	*pii_targets;	/* List of targets on this link */
    232 	struct logint	*pii_probe_logint; /* IFF_NOFAILOVER addr for probing */
    233 	struct logint	*pii_logint;	/* Doubly linked list of logical ifs */
    234 
    235 	int	pii_probe_sock;		/* Socket for ICMP Probe packets */
    236 	int	pii_af;			/* Address family */
    237 	uint16_t pii_rack;		/* highest acknowledged seq number */
    238 	uint16_t pii_snxt;		/* sequence number of next probe */
    239 	uint_t	pii_snxt_time;		/* actual next probe time that */
    240 					/* includes some randomness */
    241 
    242 	uint_t	pii_snxt_basetime; 	/* strictly periodic base probe time */
    243 					/* for all periodic probes */
    244 	uint_t	pii_fd_snxt_basetime; 	/* strictly periodic base probe time */
    245 					/* for failure detection probes */
    246 
    247 	hrtime_t 	pii_fd_hrtime;	/* hrtime_t before which we should */
    248 					/* not send probes out this pii */
    249 
    250 	uint64_t	pii_flags;	/* Phyint flags from kernel */
    251 
    252 	struct probe_stats {
    253 		uint_t		pr_id;		/* Full ID of probe */
    254 		struct target	*pr_target;	/* Probe Target */
    255 		uint_t		pr_time_lost; 	/* Time probe declared lost */
    256 		struct timeval	pr_tv_sent;	/* Wall time probe was sent */
    257 		hrtime_t pr_hrtime_start;	/* hrtime probe op started */
    258 		hrtime_t pr_hrtime_sent;	/* hrtime probe was sent */
    259 		hrtime_t pr_hrtime_ackrecv; 	/* hrtime probe ack received */
    260 		hrtime_t pr_hrtime_ackproc;	/* hrtime probe ack processed */
    261 		uint_t	pr_status;	/* probe status as below */
    262 #define	PR_UNUSED	0		/* Probe slot unused */
    263 #define	PR_UNACKED	1		/* Probe is unacknowledged */
    264 #define	PR_ACKED	2		/* Probe has been acknowledged */
    265 #define	PR_LOST		3		/* Probe is declared lost */
    266 	} pii_probes[PROBE_STATS_COUNT];
    267 
    268 	uint_t
    269 		pii_in_use : 1,			/* To detect removed phyints */
    270 		pii_basetime_inited : 1,	/* probe time initialized */
    271 		pii_targets_are_routers : 1;	/* routers or hosts ? */
    272 
    273 	uint_t	pii_probe_next;		/* next index to use in pii_probes[] */
    274 	struct target *pii_target_next;	/* next target for probing */
    275 	struct target *pii_rtt_target_next;
    276 					/* next target for rtt probes */
    277 
    278 	int	pii_ntargets;		/* Number of active targets */
    279 	struct stats {			/* Cumulative statistics */
    280 		uint64_t	lost;		/* Number of probes lost */
    281 		uint64_t	acked;		/* Number of probes acked */
    282 		uint64_t	sent;		/* Number of probes sent */
    283 		uint64_t	unknown;	/* Number of ambiguous */
    284 						/* probe acks */
    285 	} pii_cum_stats;
    286 };
    287 
    288 #define	pii_name	pii_phyint->pi_name
    289 #define	pii_ifindex	pii_phyint->pi_ifindex
    290 #define	pii_state	pii_phyint->pi_state
    291 #define	pii_icmpid	pii_phyint->pi_icmpid
    292 
    293 #define	PR_STATUS_VALID(status)		((status) <= PR_LOST)
    294 
    295 
    296 /*
    297  * A doubly linked list of prefixes or logicals, hanging off the
    298  * phyint instance.
    299  */
    300 struct logint {
    301 	struct logint	*li_next;	/* Next logint of this phyint inst. */
    302 	struct logint	*li_prev;	/* Prev logint of this phyint inst. */
    303 	struct phyint_instance	*li_phyint_inst;
    304 					/* Back pointer to phyint inst. */
    305 
    306 	char		li_name[LIFNAMSIZ + 1];	/* name Eg. hme0:1 */
    307 	struct in6_addr	li_addr;	/* IP address */
    308 	struct in6_addr	li_dstaddr;	/* Dst IP address for pointopoint */
    309 	struct in6_addr	li_subnet;	/* prefix / subnet */
    310 	uint_t		li_subnet_len;	/* prefix / subnet length */
    311 	uint64_t	li_flags;	/* IFF_* flags */
    312 	uint_t
    313 			li_in_use : 1,	/* flag to detect deleted logints */
    314 			li_dupaddr : 1;	/* test address is not unique */
    315 };
    316 
    317 
    318 /*
    319  * Doubly-linked list of probe targets on a phyint instance. Probe targets are
    320  * usually onlink routers. If no onlink routers can be found, onlink hosts
    321  * are used.
    322  */
    323 struct target {
    324 	struct target	*tg_next;	/* Next target for this phyint inst. */
    325 	struct target	*tg_prev;	/* Prev target for this phyint inst. */
    326 	struct phyint_instance	*tg_phyint_inst;
    327 					/* Back pointer to phyint instance */
    328 
    329 	struct in6_addr	tg_address;	/* Target IP address */
    330 	int		tg_status;	/* Status of the target below */
    331 #define	TG_ACTIVE	1		/* active probe target */
    332 #define	TG_UNUSED	2		/* target not in use now */
    333 #define	TG_SLOW		3		/* rtt is high - Not in use now */
    334 #define	TG_DEAD		4		/* Target is not responding */
    335 
    336 	hrtime_t	tg_latime;	/* Target's last active time */
    337 	int64_t		tg_rtt_sa;	/* Scaled RTT average (in ns) */
    338 	int64_t		tg_rtt_sd;	/* Scaled RTT deviation (in ns) */
    339 	int		tg_crtt;	/* Conservative RTT = A + 4D (in ms) */
    340 	uint32_t
    341 			tg_in_use : 1;	/* In use flag */
    342 	int64_t		tg_deferred[MAXDEFERREDRTT + 1];
    343 					/* Deferred rtt data points */
    344 	int		tg_num_deferred;
    345 					/* Number of deferred rtt data points */
    346 };
    347 
    348 #define	TG_STATUS_VALID(status) \
    349 	(((status) >= TG_ACTIVE) && ((status) <= TG_DEAD))
    350 
    351 /*
    352  * Statistics about consecutive probe failures are passed around between
    353  * functions in this structure.
    354  */
    355 struct probe_fail_count
    356 {
    357 	uint_t	pf_tff;		/* Earliest time of failure in a series */
    358 	int	pf_nfail;	/* Number of consecutive probe failures */
    359 	int	pf_nfail_tg;	/* Number of consecutive probe fails for */
    360 				/* some given target 'tg' */
    361 };
    362 
    363 /*
    364  * Statistics about consecutive probe successes is passed around between
    365  * functions in this structure.
    366  */
    367 struct probe_success_count
    368 {
    369 	uint_t ps_tls;		/* Most recent time of probe success */
    370 	boolean_t ps_tls_valid;	/* is ps_tls valid */
    371 	int	ps_nsucc;	/* Number of consecutive probe successes */
    372 				/* starting from the most recent */
    373 	int	ps_nsucc_tg;	/* Number of consecutive probe successes */
    374 				/* for some given target 'tg' */
    375 };
    376 
    377 /*
    378  * Statistics about missed probes that were never sent.
    379  * Happens due to scheduling delay.
    380  */
    381 
    382 struct probes_missed
    383 {
    384 	uint_t	pm_nprobes;	/* Cumulative number of missed probes */
    385 	uint_t	pm_ntimes;	/* Total number of occasions */
    386 };
    387 
    388 typedef struct addrlist {
    389 	struct addrlist		*al_next; 		/* next address */
    390 	char			al_name[LIFNAMSIZ];	/* address lif name */
    391 	uint64_t		al_flags;		/* address flags */
    392 	struct sockaddr_storage	al_addr; 		/* address */
    393 } addrlist_t;
    394 
    395 /*
    396  * Globals
    397  */
    398 extern addrlist_t *localaddrs;
    399 			/* List of all local addresses, including local zones */
    400 extern struct phyint *phyints;		/* List of all phyints */
    401 extern struct phyint_group *phyint_groups; /* List of all phyint groups */
    402 extern struct phyint_group *phyint_anongroup; /* Pointer to the anon group */
    403 extern struct phyint_instance *phyint_instances;
    404 					/* List of all phyint instances */
    405 extern struct probes_missed probes_missed;
    406 					/* statistics about missed probes */
    407 
    408 /*
    409  * Function prototypes
    410  */
    411 extern int phyint_init(void);
    412 extern struct phyint *phyint_lookup(const char *name);
    413 extern struct phyint_instance *phyint_inst_lookup(int af, char *name);
    414 extern struct phyint_instance *phyint_inst_init_from_k(int af, char *name);
    415 extern struct phyint_instance *phyint_inst_other(struct phyint_instance *pii);
    416 extern int phyint_inst_update_from_k(struct phyint_instance *pii);
    417 extern void phyint_inst_delete(struct phyint_instance *pii);
    418 extern uint_t phyint_inst_timer(struct phyint_instance *pii);
    419 extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii);
    420 
    421 extern void phyint_changed(struct phyint *pi);
    422 extern void phyint_chstate(struct phyint *pi, enum pi_state state);
    423 extern void phyint_group_chstate(struct phyint_group *pg, enum pg_state state);
    424 extern struct phyint_group *phyint_group_create(const char *pg_name);
    425 extern struct phyint_group *phyint_group_lookup(const char *pg_name);
    426 extern void phyint_group_insert(struct phyint_group *pg);
    427 extern void phyint_group_delete(struct phyint_group *pg);
    428 extern void phyint_group_refresh_state(struct phyint_group *pg);
    429 extern void phyint_standby_refresh_inactive(struct phyint *pi);
    430 extern void phyint_check_for_repair(struct phyint *pi);
    431 extern void phyint_transition_to_running(struct phyint *pi);
    432 extern void phyint_activate_another(struct phyint *pi);
    433 extern int phyint_offline(struct phyint *pi, unsigned int);
    434 extern int phyint_undo_offline(struct phyint *pi);
    435 extern boolean_t phyint_is_functioning(struct phyint *pi);
    436 
    437 extern void logint_init_from_k(struct phyint_instance *pii, char *li_name);
    438 extern void logint_delete(struct logint *li);
    439 
    440 extern struct target *target_lookup(struct phyint_instance *pii,
    441     struct in6_addr addr);
    442 extern void target_create(struct phyint_instance *pii,
    443     struct in6_addr addr, boolean_t is_router);
    444 extern void target_delete(struct target *tg);
    445 extern struct target *target_next(struct target *tg);
    446 extern void target_add(struct phyint_instance *pii, struct in6_addr addr,
    447     boolean_t is_router);
    448 
    449 extern void in_data(struct phyint_instance *pii);
    450 extern void in6_data(struct phyint_instance *pii);
    451 
    452 extern void logperror_pii(struct phyint_instance *pii, const char *str);
    453 extern void logperror_li(struct logint *li, const char *str);
    454 extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len);
    455 extern void addr2storage(int af, const struct in6_addr *addr,
    456     struct sockaddr_storage *ssp);
    457 extern void phyint_inst_print_all(void);
    458 extern boolean_t prefix_equal(struct in6_addr, struct in6_addr, uint_t);
    459 
    460 extern void reset_crtt_all(struct phyint *pi);
    461 extern int failure_state(struct phyint_instance *pii);
    462 extern void process_link_state_changes(void);
    463 extern void clear_pii_probe_stats(struct phyint_instance *pii);
    464 extern void start_timer(struct phyint_instance *pii);
    465 extern void stop_probing(struct phyint *pi);
    466 
    467 extern boolean_t own_address(struct in6_addr addr);
    468 extern boolean_t change_pif_flags(struct phyint *pi, uint64_t set,
    469     uint64_t clear);
    470 
    471 extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag);
    472 extern int probe_state_event(struct probe_stats *, struct phyint_instance *);
    473 extern void probe_chstate(struct probe_stats *, struct phyint_instance *, int);
    474 
    475 extern unsigned int getgraddrinfo(const char *, struct sockaddr_storage *,
    476     ipmp_addrinfo_t **);
    477 extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **);
    478 extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **);
    479 extern unsigned int getgrouplist(ipmp_grouplist_t **);
    480 extern unsigned int getsnap(ipmp_snap_t **);
    481 
    482 extern boolean_t addrlist_add(addrlist_t **, const char *, uint64_t,
    483     struct sockaddr_storage *);
    484 extern void addrlist_free(addrlist_t **);
    485 
    486 #ifdef	__cplusplus
    487 }
    488 #endif
    489 
    490 #endif	/* _MPD_TABLES_H */
    491