Home | History | Annotate | Download | only in sys
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #ifndef	_MAC_FLOW_IMPL_H
     28 #define	_MAC_FLOW_IMPL_H
     29 
     30 #ifdef	__cplusplus
     31 extern "C" {
     32 #endif
     33 
     34 #include <sys/param.h>
     35 #include <sys/atomic.h>
     36 #include <sys/ksynch.h>
     37 #include <sys/mac_flow.h>
     38 #include <sys/stream.h>
     39 #include <sys/sdt.h>
     40 #include <net/if.h>
     41 
     42 /*
     43  * Macros to increment/decrement the reference count on a flow_entry_t.
     44  */
     45 #define	FLOW_REFHOLD(flent) {					\
     46 	DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));	\
     47 	mutex_enter(&(flent)->fe_lock);				\
     48 	(flent)->fe_refcnt++;					\
     49 	mutex_exit(&(flent)->fe_lock);				\
     50 }
     51 
     52 /*
     53  * Data paths must not attempt to use a flow entry if it is marked INCIPIENT
     54  * or QUIESCE. In the former case the set up is not yet complete and the
     55  * data path could stumble on inconsistent data structures. In the latter
     56  * case a control operation is waiting for quiescence so that it can
     57  * change callbacks or other structures without the use of locks.
     58  */
     59 #define	FLOW_TRY_REFHOLD(flent, err) {				\
     60 	DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));	\
     61 	(err) = 0;						\
     62 	mutex_enter(&(flent)->fe_lock);				\
     63 	if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \
     64 	    FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH))			\
     65 		(err) = -1;					\
     66 	else							\
     67 		(flent)->fe_refcnt++;				\
     68 	mutex_exit(&(flent)->fe_lock);				\
     69 }
     70 
     71 #define	FLOW_REFRELE(flent) {					\
     72 	DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent));	\
     73 	mutex_enter(&(flent)->fe_lock);				\
     74 	ASSERT((flent)->fe_refcnt != 0);			\
     75 	(flent)->fe_refcnt--;					\
     76 	if ((flent)->fe_flags & FE_WAITER) {			\
     77 		ASSERT((flent)->fe_refcnt != 0);		\
     78 		cv_signal(&(flent)->fe_cv);			\
     79 		mutex_exit(&(flent)->fe_lock);			\
     80 	} else if ((flent)->fe_refcnt == 0) {			\
     81 		mac_flow_destroy(flent);			\
     82 	} else {						\
     83 		mutex_exit(&(flent)->fe_lock);			\
     84 	}							\
     85 }
     86 
     87 #define	FLOW_USER_REFHOLD(flent) {			\
     88 	mutex_enter(&(flent)->fe_lock);			\
     89 	(flent)->fe_user_refcnt++;			\
     90 	mutex_exit(&(flent)->fe_lock);			\
     91 }
     92 
     93 #define	FLOW_USER_REFRELE(flent) {			\
     94 	mutex_enter(&(flent)->fe_lock);			\
     95 	ASSERT((flent)->fe_user_refcnt != 0);		\
     96 	if (--(flent)->fe_user_refcnt == 0 &&		\
     97 	    ((flent)->fe_flags & FE_WAITER))		\
     98 		cv_signal(&(flent)->fe_cv);		\
     99 	mutex_exit(&(flent)->fe_lock);			\
    100 }
    101 
    102 #define	FLOW_FINAL_REFRELE(flent) {			\
    103 	ASSERT(flent->fe_refcnt == 1 && flent->fe_user_refcnt == 0);	\
    104 	FLOW_REFRELE(flent);				\
    105 }
    106 
    107 /*
    108  * Mark or unmark the flent with a bit flag
    109  */
    110 #define	FLOW_MARK(flent, flag) {		\
    111 	mutex_enter(&(flent)->fe_lock);		\
    112 	(flent)->fe_flags |= flag;		\
    113 	mutex_exit(&(flent)->fe_lock);		\
    114 }
    115 
    116 #define	FLOW_UNMARK(flent, flag) {		\
    117 	mutex_enter(&(flent)->fe_lock);		\
    118 	(flent)->fe_flags &= ~flag;		\
    119 	mutex_exit(&(flent)->fe_lock);		\
    120 }
    121 
    122 #define	FLENT_TO_MIP(flent)			\
    123 	(flent->fe_mbg != NULL ? mac_bcast_grp_mip(flent->fe_mbg) :	\
    124 	((mac_client_impl_t *)flent->fe_mcip)->mci_mip)
    125 
    126 /* Convert a bandwidth expressed in bps to a number of bytes per tick. */
    127 #define	FLOW_BYTES_PER_TICK(bps)	(((bps) >> 3) / hz)
    128 
    129 /*
    130  * Given an underlying range and a priority level, obtain the minimum for the
    131  * new range.
    132  */
    133 #define	FLOW_MIN_PRIORITY(min, max, pri)	\
    134 	((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri)))
    135 
    136 /*
    137  * Given an underlying range and a minimum level (base), obtain the maximum
    138  * for the new range.
    139  */
    140 #define	FLOW_MAX_PRIORITY(min, max, base)	\
    141 	((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS))
    142 
    143 /*
    144  * Given an underlying range and a priority level, get the absolute
    145  * priority value. For now there are just 3 values, high, low and
    146  * medium  so we can just return max, min or min + (max - min) / 2.
    147  * If there are more than three we need to change this computation.
    148  */
    149 #define	FLOW_PRIORITY(min, max, pri)		\
    150 	(pri) == MPL_HIGH ? (max) :	\
    151 	(pri) == MPL_LOW ? (min) :	\
    152 	((min) + (((max) - (min)) / 2))
    153 
/*
 * Flow table size constant. How it is consumed is not visible in this
 * header. NOTE(review): presumably a default hash-bucket count; confirm
 * in mac_flow.c.
 */
#define	MAC_FLOW_TAB_SIZE		500

/*
 * Short type names for the structures defined later in this header, plus
 * forward declarations of structures that are only referenced by pointer
 * here.
 */
typedef struct flow_entry_s		flow_entry_t;
typedef struct flow_tab_s		flow_tab_t;
typedef struct flow_state_s 		flow_state_t;
struct mac_impl_s;
struct mac_client_impl_s;
    161 
/*
 * Classification flags used to lookup the flow. The values are distinct
 * bits and may be OR-ed together.
 */
#define	FLOW_INBOUND		0x01
#define	FLOW_OUTBOUND		0x02
/* Don't compare VID when classifying the packets, see mac_rx_classify() */
#define	FLOW_IGNORE_VLAN	0x04
    169 
/*
 * Generic flow client function signature. This is the type of the
 * fe_cb_fn delivery callback stored in flow_entry_t.
 * NOTE(review): the two void * arguments presumably correspond to
 * fe_cb_arg1/fe_cb_arg2; the meaning of the boolean_t is not visible
 * here -- confirm against the callers.
 */
typedef void		(*flow_fn_t)(void *, void *, mblk_t *, boolean_t);
    172 
/*
 * Flow state selector, taken as the second argument of mac_flow_wait().
 * NOTE(review): presumably FLOW_DRIVER_UPCALL waits on fe_refcnt and
 * FLOW_USER_REF on fe_user_refcnt -- confirm in mac_flow.c.
 */
typedef enum {
	FLOW_DRIVER_UPCALL,
	FLOW_USER_REF
} mac_flow_state_t;
    178 
/*
 * Matches a flow_entry_t using the extracted flow_state_t info.
 * Stored per entry in fe_match; returns a boolean_t match result.
 */
typedef boolean_t	(*flow_match_fn_t)(flow_tab_t *, flow_entry_t *,
			    flow_state_t *);
    182 
/*
 * fe_flags: state bits of a flow_entry_t, modified under fe_lock (see
 * FLOW_MARK/FLOW_UNMARK). FLOW_TRY_REFHOLD refuses a hold when any of
 * QUIESCE, INCIPIENT, CONDEMNED or the two NO_DATAPATH bits is set.
 */
#define	FE_QUIESCE		0x01	/* Quiesce the flow */
#define	FE_WAITER		0x02	/* Flow has a waiter */
#define	FE_FLOW_TAB		0x04	/* Flow is in the flow tab list */
#define	FE_G_FLOW_HASH		0x08	/* Flow is in the global flow hash */
#define	FE_INCIPIENT		0x10	/* Being setup */
#define	FE_CONDEMNED		0x20	/* Being deleted */
#define	FE_UF_NO_DATAPATH	0x40	/* No datapath setup for User flow */
#define	FE_MC_NO_DATAPATH	0x80	/* No datapath setup for mac client */
    192 
/*
 * fe_type: kind of flow, stored in flow_entry_t's fe_type field. The
 * values are distinct bits; FLOW_VNIC is an alias for FLOW_VNIC_MAC.
 */
#define	FLOW_PRIMARY_MAC	0x01 	/* NIC primary MAC address */
#define	FLOW_VNIC_MAC		0x02	/* VNIC flow */
#define	FLOW_MCAST		0x04	/* Multicast (and broadcast) */
#define	FLOW_OTHER		0x08	/* Other flows configured */
#define	FLOW_USER		0x10	/* User defined flow */
#define	FLOW_VNIC		FLOW_VNIC_MAC
#define	FLOW_NO_STATS		0x20	/* Don't create stats for the flow */
    201 
    202 /*
    203  * Shared Bandwidth control counters between the soft ring set and its
    204  * associated soft rings. In case the flow associated with NIC/VNIC
    205  * has a group of Rx rings assigned to it, we have the same
    206  * number of soft ring sets as we have the Rx ring in the group
    207  * and each individual SRS (and its soft rings) decide when to
    208  * poll their Rx ring independently. But if there is a B/W limit
    209  * associated with the NIC/VNIC, then the B/W control counter is
    210  * shared across all the SRS in the group and their associated
    211  * soft rings.
    212  *
    213  * There is a many to 1 mapping between the SRS and
    214  * mac_bw_ctl if the flow has a group of Rx rings associated with
    215  * it.
    216  */
/*
 * Bandwidth control counters. flow_entry_t embeds one instance for
 * transmit (fe_tx_bw) and one for receive (fe_rx_bw).
 * NOTE(review): mac_bw_lock presumably protects the remaining fields;
 * the locking rule is not stated in this header.
 */
typedef struct mac_bw_ctl_s {
	kmutex_t	mac_bw_lock;
	uint32_t	mac_bw_state;
	size_t		mac_bw_sz;	/* ?? Is it needed */
	size_t		mac_bw_limit;	/* Max bytes to process per tick */
	size_t		mac_bw_used;	/* Bytes processed in current tick */
	size_t		mac_bw_drop_threshold; /* Max queue length */
	size_t		mac_bw_drop_bytes;
	size_t		mac_bw_polled;
	size_t		mac_bw_intr;
	clock_t		mac_bw_curr_time;
} mac_bw_ctl_t;
    229 
/*
 * A flow entry ("flent"). The trailing comment on each field names what
 * protects it: a lock name (ft_lock, fe_lock), or the tags WO and SL.
 * NOTE(review): WO presumably means write-once and SL presumably refers
 * to the MAC serialization perimeter; neither tag is defined in this
 * header -- confirm against mac_impl.h.
 */
struct flow_entry_s {					/* Protected by */
	struct flow_entry_s	*fe_next;		/* ft_lock */

	datalink_id_t		fe_link_id;		/* WO */

	/* Properties as specified for this flow */
	mac_resource_props_t	fe_resource_props;	/* SL */

	/* Properties actually effective at run time for this flow */
	mac_resource_props_t	fe_effective_props;	/* SL */

	kmutex_t		fe_lock;
	char			fe_flow_name[MAXFLOWNAMELEN];	/* fe_lock */
	flow_desc_t		fe_flow_desc;		/* fe_lock */
	kcondvar_t		fe_cv;			/* fe_lock */
	/*
	 * Initial flow ref is 1 on creation. A thread that looks up the
	 * flent, typically by a mac_flow_lookup(), dynamically holds a ref.
	 * If the ref is 1, it means there aren't any upcalls from the driver
	 * or downcalls from the stack using this flent. Structures pointing
	 * to the flent or flent inserted in lists don't count towards this
	 * refcnt. Instead they are tracked using fe_flags. Only a control
	 * thread doing a teardown operation deletes the flent, after waiting
	 * for upcalls to finish synchronously. The fe_refcnt tracks
	 * the number of upcall refs.
	 */
	uint32_t		fe_refcnt;		/* fe_lock */

	/*
	 * This tracks lookups done using the global hash list for user
	 * generated flows. This refcnt only protects the flent itself
	 * from disappearing and helps walkers to read the flent info such
	 * as flow spec. However the flent may be quiesced and the SRS could
	 * be deleted. The fe_user_refcnt tracks the number of global flow
	 * hash refs.
	 */
	uint32_t		fe_user_refcnt;		/* fe_lock */
	uint_t			fe_flags;		/* fe_lock */

	/*
	 * Function/args to invoke for delivering matching packets.
	 * Only the function fe_cb_fn may be changed dynamically and
	 * atomically. The fe_cb_arg1 and fe_cb_arg2 are set at creation
	 * time and may not be changed.
	 */
	flow_fn_t		fe_cb_fn;		/* fe_lock */
	void 			*fe_cb_arg1;		/* fe_lock */
	void			*fe_cb_arg2;		/* fe_lock */

	void			*fe_client_cookie;	/* WO */
	void			*fe_rx_ring_group;	/* SL */
	void			*fe_rx_srs[MAX_RINGS_PER_GROUP]; /* fe_lock */
	int			fe_rx_srs_cnt;		/* fe_lock */
	void			*fe_tx_srs;		/* WO */

	/*
	 * Set for a unicast flow; points to a mac_client_impl_t
	 * (FLENT_TO_MIP casts it to that type).
	 */
	void			*fe_mcip; 		/* WO */

	/*
	 * Used by mci_flent_list of mac_client_impl_t to track flows sharing
	 * the same mac_client_impl_t.
	 */
	struct flow_entry_s	*fe_client_next;

	/*
	 * Set for a broadcast or multicast flow; points to a mac_bcast_grp_t.
	 * FLENT_TO_MIP tests this field first to pick the flow kind.
	 */
	void			*fe_mbg;		/* WO */
	uint_t			fe_type;		/* WO */

	/*
	 * BW control info, one set of counters per direction.
	 */
	mac_bw_ctl_t		fe_tx_bw;
	mac_bw_ctl_t		fe_rx_bw;

	/*
	 * Used by flow table lookup code
	 */
	flow_match_fn_t		fe_match;

	/*
	 * Used by mac_flow_remove().
	 */
	int			fe_index;
	flow_tab_t		*fe_flow_tab;

	/*
	 * Statistics. FLOW_STAT_UPDATE accumulates into fe_flowstats.
	 * NOTE(review): fe_ksp is presumably the kstat exporting them.
	 */
	kstat_t			*fe_ksp;
	flow_stats_t		fe_flowstats;
	boolean_t		fe_desc_logged;
	uint64_t		fe_nic_speed;
};
    324 
    325 /*
    326  * Various structures used by the flows framework for keeping track
    327  * of packet state information.
    328  */
    329 
/*
 * Layer 2 packet state, embedded in flow_state_t as fs_l2info.
 * NOTE(review): field meanings below are inferred from their names;
 * confirm against the accept functions in mac_flow.c.
 */
typedef struct flow_l2info_s {
	uchar_t		*l2_start;	/* presumably start of the L2 header */
	uint8_t		*l2_daddr;	/* presumably destination address */
	uint16_t	l2_vid;		/* presumably VLAN id */
	uint32_t	l2_sap;		/* presumably SAP / ethertype */
	uint_t		l2_hdrsize;	/* presumably L2 header length */
} flow_l2info_t;
    338 
/*
 * Layer 3 packet state, embedded in flow_state_t as fs_l3info.
 * NOTE(review): field meanings below are inferred from their names;
 * confirm against the accept functions in mac_flow.c.
 */
typedef struct flow_l3info_s {
	uchar_t		*l3_start;	/* presumably start of the L3 header */
	uint8_t		l3_protocol;	/* presumably next-protocol number */
	uint8_t		l3_version;	/* presumably IP version */
	boolean_t	l3_dst_or_src;	/* TODO confirm which value means dst */
	uint_t		l3_hdrsize;	/* presumably L3 header length */
	boolean_t	l3_fragmented;	/* presumably set for IP fragments */
} flow_l3info_t;
    348 
/*
 * Layer 4 packet state, embedded in flow_state_t as fs_l4info.
 * NOTE(review): field meanings below are inferred from their names;
 * byte order of the ports is not visible here.
 */
typedef struct flow_l4info_s {
	uchar_t		*l4_start;	/* presumably start of the L4 header */
	uint16_t	l4_src_port;
	uint16_t	l4_dst_port;
	uint16_t	l4_hash_port;	/* presumably the port used for hashing */
} flow_l4info_t;
    356 
/*
 * Combined state structure.
 * Holds flow direction and an mblk_t pointer, together with the per-layer
 * (L2/L3/L4) state. It is the argument passed to the fo_hash and
 * fo_accept operations and to the per-entry fe_match function.
 * NOTE(review): fs_flags presumably carries the FLOW_INBOUND /
 * FLOW_OUTBOUND / FLOW_IGNORE_VLAN classification flags -- confirm.
 */
struct flow_state_s {
	uint_t		fs_flags;
	mblk_t		*fs_mp;
	flow_l2info_t	fs_l2info;
	flow_l3info_t	fs_l3info;
	flow_l4info_t	fs_l4info;
};
    368 
/*
 * Flow ops vector.
 * There are two groups of functions. The ones ending with _fe are
 * called when a flow is being added. The others (hash, accept) are
 * called at flow lookup time.
 *
 * FLOW_MAX_ACCEPT is the capacity of the fo_accept array below.
 */
#define	FLOW_MAX_ACCEPT	16
typedef struct flow_ops_s {
	/*
	 * fo_accept_fe():
	 * Validates the contents of the flow and checks whether
	 * it's compatible with the flow table. Sets the fe_match
	 * function of the flow.
	 */
	int		(*fo_accept_fe)(flow_tab_t *, flow_entry_t *);
	/*
	 * fo_hash_fe():
	 * Generates a hash index to the flow table. This function
	 * must use the same algorithm as fo_hash(), which is used
	 * by the flow lookup code path.
	 */
	uint32_t	(*fo_hash_fe)(flow_tab_t *, flow_entry_t *);
	/*
	 * fo_match_fe():
	 * This is used for finding identical flows.
	 */
	boolean_t	(*fo_match_fe)(flow_tab_t *, flow_entry_t *,
			    flow_entry_t *);
	/*
	 * fo_insert_fe():
	 * Used for inserting a flow to a flow chain.
	 * Protocols that have special ordering requirements would
	 * need to implement this. For those that don't,
	 * flow_generic_insert_fe() may be used.
	 */
	int		(*fo_insert_fe)(flow_tab_t *, flow_entry_t **,
			    flow_entry_t *);

	/*
	 * fo_hash():
	 * Calculates the flow hash index based on the accumulated
	 * state in flow_state_t. Must use the same algorithm as
	 * fo_hash_fe().
	 */
	uint32_t	(*fo_hash)(flow_tab_t *, flow_state_t *);

	/*
	 * fo_accept[]:
	 * Array of accept functions.
	 * Each function in the array will accumulate enough state
	 * (header length, protocol) to allow the next function to
	 * proceed. We support up to FLOW_MAX_ACCEPT functions which
	 * should be sufficient for all practical purposes.
	 */
	int		(*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *,
			    flow_state_t *);
} flow_ops_t;
    424 
/*
 * Generic flow table: a hash table of flow_entry_t chains plus the ops
 * vector that knows how to hash and match entries of this table type.
 */
struct flow_tab_s {
	/* Protects the hash chains (flow_entry_t's fe_next is under ft_lock) */
	krwlock_t		ft_lock;
	/*
	 * Contains a list of functions (described above)
	 * specific to this table type.
	 */
	flow_ops_t		ft_ops;

	/*
	 * Indicates what types of flows are supported.
	 */
	flow_mask_t		ft_mask;

	/*
	 * An array of flow_entry_t * of size ft_size.
	 * Each element is the beginning of a hash chain.
	 */
	flow_entry_t		**ft_table;
	uint_t			ft_size;

	/*
	 * The number of flows inserted into ft_table.
	 * FLOW_TAB_EMPTY() tests this count.
	 */
	uint_t			ft_flow_count;
	/*
	 * Back pointers to the mac_impl / mac client this table is
	 * associated with. NOTE(review): which of the two is set in which
	 * case is not visible here.
	 */
	struct mac_impl_s	*ft_mip;
	struct mac_client_impl_s	*ft_mcip;
};
    455 
/*
 * This is used for describing what type of flow table can be created.
 * mac_flow.c contains a list of these structures. The three fields have
 * the same types as the ops, mask and size arguments of
 * mac_flow_tab_create().
 */
typedef struct flow_tab_info_s {
	flow_ops_t		*fti_ops;	/* ops vector for the table */
	flow_mask_t		fti_mask;	/* flow types supported */
	uint_t			fti_size;	/* table size */
} flow_tab_info_t;
    465 
    466 #define	FLOW_TAB_EMPTY(ft)	((ft) == NULL || (ft)->ft_flow_count == 0)
    467 
/*
 * This is used by mac_tx_send. The three counters are the ones that
 * FLOW_TX_STATS_UPDATE folds into a flent's fe_flowstats.
 */
typedef struct mac_tx_stats_s {
	uint_t			ts_opackets;	/* added to fs_opackets */
	uint_t			ts_obytes;	/* added to fs_obytes */
	uint_t			ts_oerrors;	/* added to fs_oerrors */
} mac_tx_stats_t;
    476 
    477 #define	FLOW_STAT_UPDATE(f, s, c)  {					\
    478 	((flow_entry_t *)(f))->fe_flowstats.fs_##s += ((uint64_t)(c));	\
    479 }
    480 
    481 #define	FLOW_TX_STATS_UPDATE(f, s) {					\
    482 	FLOW_STAT_UPDATE((f), opackets, (s)->ts_opackets);		\
    483 	FLOW_STAT_UPDATE((f), obytes, (s)->ts_obytes);			\
    484 	FLOW_STAT_UPDATE((f), oerrors, (s)->ts_oerrors);		\
    485 }
    486 
    487 extern void	mac_flow_init();
    488 extern void	mac_flow_fini();
    489 extern int	mac_flow_create(flow_desc_t *, mac_resource_props_t *,
    490 		    char *, void *, uint_t, flow_entry_t **);
    491 
    492 extern int	mac_flow_add(flow_tab_t *, flow_entry_t *);
    493 extern int	mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *,
    494 		    boolean_t);
    495 extern int	mac_flow_hash_add(flow_entry_t *);
    496 extern int	mac_flow_lookup_byname(char *, flow_entry_t **);
    497 extern int	mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t,
    498 		    flow_entry_t **);
    499 
    500 extern int	mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *),
    501 		    void *);
    502 
    503 extern int	mac_flow_walk_nolock(flow_tab_t *,
    504 		    int (*)(flow_entry_t *, void *), void *);
    505 
    506 extern void	mac_flow_modify(flow_tab_t *, flow_entry_t *,
    507 		    mac_resource_props_t *);
    508 
    509 extern void	*mac_flow_get_client_cookie(flow_entry_t *);
    510 
    511 extern uint32_t	mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *);
    512 
    513 extern int	mac_flow_update(flow_tab_t *, flow_entry_t *, flow_desc_t *);
    514 extern void	mac_flow_get_desc(flow_entry_t *, flow_desc_t *);
    515 extern void	mac_flow_set_desc(flow_entry_t *, flow_desc_t *);
    516 
    517 extern void	mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t);
    518 extern void	mac_flow_hash_remove(flow_entry_t *);
    519 extern void	mac_flow_wait(flow_entry_t *, mac_flow_state_t);
    520 extern void	mac_flow_quiesce(flow_entry_t *);
    521 extern void	mac_flow_restart(flow_entry_t *);
    522 extern void	mac_flow_cleanup(flow_entry_t *);
    523 extern void	mac_flow_destroy(flow_entry_t *);
    524 
    525 extern void	mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t,
    526 		    struct mac_impl_s *, flow_tab_t **);
    527 extern void	mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **);
    528 extern void	mac_flow_tab_destroy(flow_tab_t *);
    529 extern void	mac_flow_drop(void *, void *, mblk_t *);
    530 extern void	flow_stat_destroy(flow_entry_t *);
    531 
    532 #ifdef	__cplusplus
    533 }
    534 #endif
    535 
    536 #endif	/* _MAC_FLOW_IMPL_H */
    537