Home | History | Annotate | Download | only in sys
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     22 /*	  All Rights Reserved  	*/
     23 
     24 
     25 /*
     26  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     27  * Use is subject to license terms.
     28  */
     29 
     30 #ifndef _SYS_CALLO_H
     31 #define	_SYS_CALLO_H
     32 
     33 #include <sys/t_lock.h>
     34 #include <sys/taskq.h>
     35 #include <sys/lgrp.h>
     36 #include <sys/processor.h>
     37 #include <sys/cyclic.h>
     38 #include <sys/kstat.h>
     39 #include <sys/systm.h>
     40 
     41 #ifdef	__cplusplus
     42 extern "C" {
     43 #endif
     44 
     45 #ifdef	_KERNEL
     46 
/* Forward declaration: struct callout refers to its list before it is defined. */
typedef struct callout_list	callout_list_t;
     48 
     49 /*
     50  * The callout mechanism provides general-purpose event scheduling:
     51  * an arbitrary function is called in a specified amount of time.
     52  * The expiration time for a callout is kept in its callout list
     53  * structure.
     54  */
typedef struct callout {
	/* Linkage: a callout sits on an ID hash chain and on a callout list. */
	struct callout	*c_idnext;	/* next in ID hash, or on freelist */
	struct callout	*c_idprev;	/* prev in ID hash */
	struct callout	*c_clnext;	/* next in callout list */
	struct callout	*c_clprev;	/* prev in callout list */
	callout_id_t	c_xid;		/* extended callout ID; see below */
	/* The expiration time is kept in the list structure, not here. */
	callout_list_t	*c_list;	/* callout list */
	void		(*c_func)(void *); /* function to call */
	void		*c_arg;		/* argument to function */
	kthread_t	*c_executor;	/* executing thread */
	kcondvar_t	c_done;		/* signal callout completion */
	ushort_t	c_waiting;	/* untimeout waiting flag */
} callout_t;
     68 
     69 /*
     70  * The callout ID (callout_id_t) uniquely identifies a callout. The callout
     71  * ID is always 64 bits internally. The lower 32 bits contain an ID value.
     72  * The upper 32 bits contain a generation number and flags. When the ID value
 * wraps, the generation number is incremented during ID generation. This
     74  * protects callers from ID collisions that can happen as a result of the wrap.
     75  *
     76  * The kernel internal interface, timeout_generic(), always returns a
     77  * callout_id_t. But the legacy interfaces, timeout() and realtime_timeout()
     78  * return a timeout_id_t. On a 64-bit system, timeout_id_t is also 64 bits.
     79  * So, the full 64-bit ID (sans the flags) can be returned. However, on 32-bit
     80  * systems, timeout_id_t is 32 bits. So, only the lower 32 bits can be
     81  * returned. In such cases, a default generation number of 0 is assigned to
     82  * the legacy IDs.
     83  *
     84  * The lower 32-bit ID space is partitioned into two spaces - one for
     85  * short-term callouts and one for long-term.
     86  *
     87  * Here is the bit layout for the callout ID:
     88  *
     89  *      63    62    61 ...  32    31      30     29 .. X+1  X ... 1   0
     90  *  -----------------------------------------------------------------------
     91  *  | Free | Exec | Generation | Long | Counter | ID bits | Table  | Type |
     92  *  |      |      | number     | term | High    |         | number |      |
     93  *  -----------------------------------------------------------------------
     94  *
     95  * Free:
     96  *    This bit indicates that this callout has been freed. This is for
     97  *    debugging purposes.
     98  *
     99  * Exec(uting):
    100  *    This is the executing bit which is only set in the extended callout
    101  *    ID. This bit indicates that the callout handler is currently being
    102  *    executed.
    103  *
    104  * Generation number:
    105  *    This is the generation part of the ID.
    106  *
    107  * Long term:
    108  *    This bit indicates whether this is a short-term or a long-term callout.
    109  *    The long-term bit exists to address the problem of callout ID collision
    110  *    on 32-bit systems. This is an issue because the system typically
    111  *    generates a large number of timeout() requests, which means that callout
    112  *    IDs eventually get recycled. Most timeouts are very short-lived, so that
    113  *    ID recycling isn't a problem; but there are a handful of timeouts which
    114  *    are sufficiently long-lived to see their own IDs reused. We use the
    115  *    long-term bit to partition the ID namespace into pieces; the short-term
    116  *    space gets all the heavy traffic and can wrap frequently (i.e., on the
    117  *    order of a day) with no ill effects; the long-term space gets very little
    118  *    traffic and thus never wraps. That said, we need to future proof callouts
    119  *    in case 32-bit systems grow in size and are able to consume callout IDs
 *    at faster rates. So, all kernel clients that use callouts should be
 *    converted to the internal interface so that they can use IDs outside
 *    of the legacy space with a proper generation number.
    123  *
    124  * Counter High + ID counter bits:
    125  *    These bits represent the actual ID bits in the callout ID.
    126  *    The highest bit of the running counter is always set; this ensures that
    127  *    the callout ID is always non-zero, thus eliminating the need for an
    128  *    explicit wrap-around test during ID generation.
    129  *
    130  * Table number:
    131  *    These bits carry the table number for the callout table where the callout
    132  *    is queued. Each CPU has its own callout table. So, the callout tables are
    133  *    numbered from 0 - (max_ncpus - 1). Because max_ncpus is different on
    134  *    different systems, the actual number of table number bits will vary
    135  *    accordingly. And so will the ID counter bits.
    136  *
    137  * Type:
    138  *    This bit represents the callout (table) type. Each CPU has one realtime
    139  *    and one normal callout table.
    140  */
    141 #define	CALLOUT_FREE		0x8000000000000000ULL
    142 #define	CALLOUT_EXECUTING	0x4000000000000000ULL
    143 #define	CALLOUT_ID_FLAGS	(CALLOUT_FREE | CALLOUT_EXECUTING)
    144 #define	CALLOUT_ID_MASK		~CALLOUT_ID_FLAGS
    145 #define	CALLOUT_GENERATION_LOW	0x100000000ULL
    146 #define	CALLOUT_LONGTERM	0x80000000
    147 #define	CALLOUT_COUNTER_HIGH	0x40000000
    148 #define	CALLOUT_TYPE_BITS	1
    149 #define	CALLOUT_NTYPES		(1 << CALLOUT_TYPE_BITS)
    150 #define	CALLOUT_TYPE_MASK	(CALLOUT_NTYPES - 1)
    151 #define	CALLOUT_COUNTER_SHIFT	callout_table_bits
    152 #define	CALLOUT_TABLE(t, f)	(((f) << CALLOUT_TYPE_BITS) | (t))
    153 #define	CALLOUT_TABLE_NUM(ct)	((ct) - callout_table)
    154 #define	CALLOUT_TABLE_TYPE(ct)	(CALLOUT_TABLE_NUM(ct) & CALLOUT_TYPE_MASK)
    155 #define	CALLOUT_TABLE_SEQID(ct)	(CALLOUT_TABLE_NUM(ct) >> CALLOUT_TYPE_BITS)
    156 
    157 /*
    158  * We assume that during any period of CALLOUT_LONGTERM_TICKS ticks, at most
    159  * (CALLOUT_COUNTER_HIGH / callout_counter_low) callouts will be generated.
    160  */
    161 #define	CALLOUT_LONGTERM_TICKS	0x4000UL
    162 #define	CALLOUT_BUCKET_SHIFT	9
    163 #define	CALLOUT_BUCKETS		(1 << CALLOUT_BUCKET_SHIFT)
    164 #define	CALLOUT_BUCKET_MASK	(CALLOUT_BUCKETS - 1)
    165 #define	CALLOUT_HASH(x)		((x) & CALLOUT_BUCKET_MASK)
    166 #define	CALLOUT_IDHASH(x)	CALLOUT_HASH((x) >> CALLOUT_COUNTER_SHIFT)
    167 /*
    168  * The multiply by 0 and 1 below are cosmetic. Just to align things better
    169  * and make it more readable. The multiplications will be done at compile
    170  * time.
    171  */
    172 #define	CALLOUT_CLHASH(x)			\
    173 	CALLOUT_HASH(				\
    174 	    ((x)>>(CALLOUT_BUCKET_SHIFT*0)) ^	\
    175 	    ((x)>>(CALLOUT_BUCKET_SHIFT*1)) ^	\
    176 	    ((x)>>(CALLOUT_BUCKET_SHIFT*2)) ^	\
    177 	    ((x)>>(CALLOUT_BUCKET_SHIFT*3)))
    178 
    179 #define	CALLOUT_ID_TO_TABLE(id)		((id) & callout_table_mask)
    180 
    181 #define	CALLOUT_SHORT_ID(table)		\
    182 		((callout_id_t)(table) | CALLOUT_COUNTER_HIGH)
    183 #define	CALLOUT_LONG_ID(table)		\
    184 		(CALLOUT_SHORT_ID(table) | CALLOUT_LONGTERM)
    185 
    186 #define	CALLOUT_THREADS		2
    187 
    188 #define	CALLOUT_REALTIME	0		/* realtime callout type */
    189 #define	CALLOUT_NORMAL		1		/* normal callout type */
    190 
    191 /*
    192  * callout_t's are cache-aligned structures allocated from kmem caches. One kmem
    193  * cache is created per lgrp and is shared by all CPUs in that lgrp. Benefits:
    194  *	- cache pages are mapped only in the TLBs of the CPUs of the lgrp
    195  *	- data in cache pages is present only in those CPU caches
    196  *	- memory access performance improves with locality-awareness in kmem
    197  *
    198  * The following structure is used to manage per-lgroup kmem caches.
    199  *
    200  * NOTE: Free callout_t's go to a callout table's freelist. CPUs map to callout
    201  * tables via their sequence IDs, not CPU IDs. DR operations can cause a
    202  * free list to have callouts from multiple lgrp caches. This takes away some
    203  * performance, but is no worse than if we did not use lgrp caches at all.
    204  */
/* One entry per lgroup; entries are chained through cc_next. */
typedef struct callout_cache {
	struct callout_cache	*cc_next;	/* link in the global list */
	lgrp_handle_t		cc_hand;	/* lgroup handle */
	/*
	 * NOTE(review): by analogy with ct_cache/ct_lcache in the callout
	 * table, cc_cache presumably backs callout_t allocations and
	 * cc_lcache callout_list_t allocations - confirm in callout.c.
	 */
	kmem_cache_t		*cc_cache;	/* kmem cache pointer */
	kmem_cache_t		*cc_lcache;	/* kmem cache pointer */
} callout_cache_t;
    211 
    212 /*
    213  * The callout hash structure is used for queueing both callouts and
    214  * callout lists. That is why the fields are declared as void *.
    215  */
/* Head/tail pair for one queue; the element type depends on the user. */
typedef struct callout_hash {
	void	*ch_head;	/* first element (callout or callout list) */
	void	*ch_tail;	/* last element (callout or callout list) */
} callout_hash_t;
    220 
    221 /*
    222  * CALLOUT_LIST_FLAG_FREE
    223  *	Callout list is free.
    224  * CALLOUT_LIST_FLAG_ABSOLUTE
    225  *	Callout list contains absolute timers.
    226  * CALLOUT_LIST_FLAG_HRESTIME
    227  *	Callout list contains hrestime timers.
    228  * CALLOUT_LIST_FLAG_NANO
    229  *	Callout list contains 1-nanosecond resolution callouts.
    230  */
    231 #define	CALLOUT_LIST_FLAG_FREE			0x1
    232 #define	CALLOUT_LIST_FLAG_ABSOLUTE		0x2
    233 #define	CALLOUT_LIST_FLAG_HRESTIME		0x4
    234 #define	CALLOUT_LIST_FLAG_NANO			0x8
    235 
/*
 * A callout list groups callouts that share one expiration time; the
 * expiration is stored here rather than in each callout_t.
 */
struct callout_list {
	callout_list_t	*cl_next;	/* next in clhash */
	callout_list_t	*cl_prev;	/* prev in clhash */
	hrtime_t	cl_expiration;	/* expiration for callouts in list */
	callout_hash_t	cl_callouts;	/* list of callouts */
	/* NOTE(review): presumably CALLOUT_LIST_FLAG_* bits - confirm. */
	int		cl_flags;	/* callout flags */
};
    243 
    244 /*
    245  * Callout heap element. Each element in the heap stores the expiration
    246  * as well as the corresponding callout list. This is to avoid a lookup
    247  * of the callout list when the heap is processed. Because we store the
    248  * callout list pointer in the heap element, we have to always remove
    249  * a heap element and its callout list together. We cannot remove one
    250  * without the other.
    251  */
typedef struct callout_heap {
	hrtime_t	ch_expiration;	/* expiration stored in the heap */
	callout_list_t	*ch_list;	/* callout list for that expiration */
} callout_heap_t;
    256 
    257 /*
    258  * When the heap contains too many empty callout lists, it needs to be
    259  * cleaned up. The decision to clean up the heap is a function of the
    260  * number of empty entries and the heap size. Also, we don't want to
    261  * clean up small heaps.
    262  */
    263 #define	CALLOUT_MIN_REAP	(CALLOUT_BUCKETS >> 3)
    264 #define	CALLOUT_CLEANUP(ct)	((ct->ct_nreap >= callout_min_reap) &&	\
    265 				    (ct->ct_nreap >= (ct->ct_heap_num >> 1)))
    266 
    267 /*
    268  * Per-callout table kstats.
    269  *
    270  * CALLOUT_TIMEOUTS
    271  *	Callouts created since boot.
    272  * CALLOUT_TIMEOUTS_PENDING
    273  *	Number of outstanding callouts.
    274  * CALLOUT_UNTIMEOUTS_UNEXPIRED
    275  *	Number of cancelled callouts that have not expired.
    276  * CALLOUT_UNTIMEOUTS_EXECUTING
    277  *	Number of cancelled callouts that were executing at the time of
    278  *	cancellation.
    279  * CALLOUT_UNTIMEOUTS_EXPIRED
    280  *	Number of cancelled callouts that had already expired at the time
 *	of cancellation.
    282  * CALLOUT_EXPIRATIONS
    283  *	Number of callouts that expired.
    284  * CALLOUT_ALLOCATIONS
    285  *	Number of callout structures allocated.
    286  * CALLOUT_CLEANUPS
    287  *	Number of times a callout table is cleaned up.
    288  */
    289 typedef enum callout_stat_type {
    290 	CALLOUT_TIMEOUTS,
    291 	CALLOUT_TIMEOUTS_PENDING,
    292 	CALLOUT_UNTIMEOUTS_UNEXPIRED,
    293 	CALLOUT_UNTIMEOUTS_EXECUTING,
    294 	CALLOUT_UNTIMEOUTS_EXPIRED,
    295 	CALLOUT_EXPIRATIONS,
    296 	CALLOUT_ALLOCATIONS,
    297 	CALLOUT_CLEANUPS,
    298 	CALLOUT_NUM_STATS
    299 } callout_stat_type_t;
    300 
    301 /*
    302  * Callout flags:
    303  *
    304  * CALLOUT_FLAG_ROUNDUP
    305  *	Roundup the expiration time to the next resolution boundary.
    306  *	If this flag is not specified, the expiration time is rounded down.
    307  * CALLOUT_FLAG_ABSOLUTE
    308  *	Normally, the expiration passed to the timeout API functions is an
    309  *	expiration interval. If this flag is specified, then it is
    310  *	interpreted as the expiration time itself.
    311  * CALLOUT_FLAG_HRESTIME
    312  *	Normally, callouts are not affected by changes to system time
    313  *	(hrestime). This flag is used to create a callout that is affected
    314  *	by system time. If system time changes, these timers must be
    315  *	handled in a special way (see callout.c). These are used by condition
    316  *	variables and LWP timers that need this behavior.
    317  * CALLOUT_FLAG_32BIT
    318  *	Legacy interfaces timeout() and realtime_timeout() pass this flag
    319  *	to timeout_generic() to indicate that a 32-bit ID should be allocated.
    320  */
    321 #define	CALLOUT_FLAG_ROUNDUP		0x1
    322 #define	CALLOUT_FLAG_ABSOLUTE		0x2
    323 #define	CALLOUT_FLAG_HRESTIME		0x4
    324 #define	CALLOUT_FLAG_32BIT		0x8
    325 
    326 /*
    327  * On 32-bit systems, the legacy interfaces, timeout() and realtime_timeout(),
    328  * must pass CALLOUT_FLAG_32BIT to timeout_generic() so that a 32-bit ID
    329  * can be generated.
    330  */
    331 #ifdef _LP64
    332 #define	CALLOUT_LEGACY		0
    333 #else
    334 #define	CALLOUT_LEGACY		CALLOUT_FLAG_32BIT
    335 #endif
    336 
    337 /*
    338  * All of the state information associated with a callout table.
    339  * The fields are ordered with cache performance in mind.
    340  */
/*
 * Per-table callout state. Do not reorder or resize fields casually: the
 * layout is tuned for cache behaviour and ct_pad is sized by hand.
 */
typedef struct callout_table {
	kmutex_t	ct_mutex;	/* protects all callout state */
	callout_t	*ct_free;	/* free callout structures */
	callout_list_t	*ct_lfree;	/* free callout list structures */
	callout_id_t	ct_short_id;	/* most recently issued short-term ID */
	callout_id_t	ct_long_id;	/* most recently issued long-term ID */
	callout_hash_t 	*ct_idhash;	/* ID hash chains */
	callout_hash_t 	*ct_clhash;	/* callout list hash */
	kstat_named_t	*ct_kstat_data;	/* callout kstat data */

	uint_t		ct_type;	/* callout table type */
	uint_t		ct_suspend;	/* suspend count */
	cyclic_id_t	ct_cyclic;	/* cyclic for this table */
	callout_heap_t	*ct_heap;	/* callout expiration heap */
	ulong_t		ct_heap_num;	/* occupied slots in the heap */
	ulong_t		ct_heap_max;	/* end of the heap */
	kmem_cache_t	*ct_cache;	/* callout kmem cache */
	kmem_cache_t	*ct_lcache;	/* callout list kmem cache */
	callout_id_t	ct_gen_id;	/* generation based ID */

	callout_hash_t	ct_expired;	/* list of expired callout lists */
	taskq_t		*ct_taskq;	/* taskq to execute normal callouts */
	kstat_t		*ct_kstats;	/* callout kstats */
	int		ct_nreap;	/* # heap entries that need reaping */
	/*
	 * The pad differs per data model because pointer and long widths
	 * differ. NOTE(review): presumably rounds the structure up to a
	 * multiple of CALLOUT_ALIGN - recheck whenever a field changes.
	 */
#ifdef _LP64
	char		ct_pad[28];	/* cache alignment */
#else
	char		ct_pad[24];	/* cache alignment */
#endif
} callout_table_t;
    371 
    372 /*
    373  * Short hand definitions for the callout kstats.
    374  */
    375 #define	ct_timeouts							\
    376 		ct_kstat_data[CALLOUT_TIMEOUTS].value.ui64
    377 #define	ct_timeouts_pending						\
    378 		ct_kstat_data[CALLOUT_TIMEOUTS_PENDING].value.ui64
    379 #define	ct_untimeouts_unexpired						\
    380 		ct_kstat_data[CALLOUT_UNTIMEOUTS_UNEXPIRED].value.ui64
    381 #define	ct_untimeouts_executing						\
    382 		ct_kstat_data[CALLOUT_UNTIMEOUTS_EXECUTING].value.ui64
    383 #define	ct_untimeouts_expired						\
    384 		ct_kstat_data[CALLOUT_UNTIMEOUTS_EXPIRED].value.ui64
    385 #define	ct_expirations							\
    386 		ct_kstat_data[CALLOUT_EXPIRATIONS].value.ui64
    387 #define	ct_allocations							\
    388 		ct_kstat_data[CALLOUT_ALLOCATIONS].value.ui64
    389 #define	ct_cleanups							\
    390 		ct_kstat_data[CALLOUT_CLEANUPS].value.ui64
    391 
    392 #define	CALLOUT_CHUNK	128
    393 
    394 #define	CALLOUT_HEAP_PARENT(index)	(((index) - 1) >> 1)
    395 #define	CALLOUT_HEAP_RIGHT(index)	(((index) + 1) << 1)
    396 #define	CALLOUT_HEAP_LEFT(index)	((((index) + 1) << 1) - 1)
    397 
    398 #define	CALLOUT_CYCLIC_HANDLER(t)					\
    399 	((t == CALLOUT_REALTIME) ? callout_realtime : callout_normal)
    400 
/* NOTE(review): presumably in nanoseconds (i.e. 10 ms) - confirm at the users. */
#define	CALLOUT_TCP_RESOLUTION		10000000ULL

/* Alignment, in bytes, used for callout structures. */
#define	CALLOUT_ALIGN	64	/* cache line size */
    404 
    405 #ifdef _LP64
    406 #define	CALLOUT_MAX_TICKS	NSEC_TO_TICK(CY_INFINITY);
    407 #else
    408 #define	CALLOUT_MAX_TICKS	LONG_MAX
    409 #endif
    410 
/* 200000 ns is 200 microseconds. */
#define	CALLOUT_TOLERANCE	200000		/* nanoseconds */
    412 
/*
 * Kernel routines declared for users of this header; all are defined
 * elsewhere.
 * NOTE(review): membar_sync() is a memory-barrier primitive rather than a
 * callout routine - presumably declared here for the callout code's own
 * use; confirm before moving it.
 */
extern void		callout_init(void);
extern void		membar_sync(void);
extern void		callout_cpu_online(cpu_t *);
extern void		callout_cpu_offline(cpu_t *);
extern void		callout_hrestime(void);
    418 
#endif	/* _KERNEL */
    420 
    421 #ifdef	__cplusplus
    422 }
    423 #endif
    424 
    425 #endif	/* _SYS_CALLO_H */
    426