Home | History | Annotate | Download | only in sys
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #ifndef _SYS_BUF_H
     40 #define	_SYS_BUF_H
     41 
     42 #include <sys/types32.h>
     43 #include <sys/t_lock.h>
     44 #include <sys/kstat.h>
     45 
     46 #ifdef	__cplusplus
     47 extern "C" {
     48 #endif
     49 
     50 /*
     51  *	Each buffer in the pool is usually doubly linked into 2 lists:
     52  *	the device with which it is currently associated (always)
     53  *	and also on a list of blocks available for allocation
     54  *	for other use (usually).
     55  *	The latter list is kept in last-used order, and the two
     56  *	lists are doubly linked to make it easy to remove
     57  *	a buffer from one list when it was found by
     58  *	looking through the other.
     59  *	A buffer is on the available list, and is liable
     60  *	to be reassigned to another disk block, if and only
     61  *	if it is not marked BUSY.  When a buffer is busy, the
     62  *	available-list pointers can be used for other purposes.
     63  *	Most drivers use the forward ptr as a link in their I/O active queue.
     64  *	A buffer header contains all the information required to perform I/O.
     65  *	Most of the routines which manipulate these things are in bio.c.
     66  *
     67  *	There are a number of locks associated with the buffer management
     68  *	system.
     69  *	hbuf.b_lock:	protects hash chains, buffer hdr freelists
     70  *			and delayed write freelist
 *	bfree_lock:	protects the bfreelist structure
     72  *	bhdr_lock:	protects the free header list
     73  *	blist_lock:	protects b_list fields
     74  *	buf.b_sem:	protects all remaining members in the buf struct
     75  *	buf.b_io:	I/O synchronization variable
     76  *
     77  *	A buffer header is never "locked" (b_sem) when it is on
     78  *	a "freelist" (bhdrlist or bfreelist avail lists).
     79  */
     80 typedef struct	buf {
     81 	int	b_flags;		/* see defines below */
     82 	struct buf *b_forw;		/* headed by d_tab of conf.c */
     83 	struct buf *b_back;		/*  "  */
     84 	struct buf *av_forw;		/* position on free list, */
     85 	struct buf *av_back;		/* if not BUSY */
     86 	o_dev_t	b_dev;			/* OLD major+minor device name */
     87 	size_t b_bcount;		/* transfer count */
     88 	union {
     89 		caddr_t b_addr;		/* low order core address */
     90 		struct fs *b_fs;	/* superblocks */
     91 		struct cg *b_cg;	/* UFS cylinder group block */
     92 		struct dinode *b_dino;	/* UFS ilist */
     93 		daddr32_t *b_daddr;	/* disk blocks */
     94 	} b_un;
     95 
     96 	lldaddr_t	_b_blkno;	/* block # on device (union) */
     97 #define	b_lblkno	_b_blkno._f
     98 #ifdef _LP64
     99 #define	b_blkno		_b_blkno._f
    100 #else
    101 #define	b_blkno		_b_blkno._p._l
    102 #endif /* _LP64 */
    103 
    104 	char	b_obs1;			/* obsolete */
    105 	size_t	b_resid;		/* words not transferred after error */
    106 	clock_t	b_start;		/* request start time */
    107 	struct  proc  *b_proc;		/* process doing physical or swap I/O */
    108 	struct	page  *b_pages;		/* page list for PAGEIO */
    109 	clock_t b_obs2;			/* obsolete */
    110 	/* Begin new stuff */
    111 #define	b_actf	av_forw
    112 #define	b_actl	av_back
    113 #define	b_active b_bcount
    114 #define	b_errcnt b_resid
    115 	size_t	b_bufsize;		/* size of allocated buffer */
    116 	int	(*b_iodone)(struct buf *);	/* function called by iodone */
    117 	struct	vnode *b_vp;		/* vnode associated with block */
    118 	struct 	buf *b_chain;		/* chain together all buffers here */
    119 	int	b_obs3;			/* obsolete */
    120 	int	b_error;		/* expanded error field */
    121 	void	*b_private;		/* "opaque" driver private area */
    122 	dev_t	b_edev;			/* expanded dev field */
    123 	ksema_t	b_sem;			/* Exclusive access to buf */
    124 	ksema_t	b_io;			/* I/O Synchronization */
    125 	struct buf *b_list;		/* List of potential B_DELWRI bufs */
    126 	struct page **b_shadow;		/* shadow page list */
    127 	void	*b_dip;			/* device info pointer */
    128 	struct vnode *b_file;		/* file associated with this buffer */
    129 	offset_t b_offset;		/* offset in file assoc. with buffer */
    130 } buf_t;
    131 
    132 /*
    133  * Bufhd structures used at the head of the hashed buffer queues.
    134  * We only need seven words for this, so this abbreviated
    135  * definition saves some space.
    136  */
    137 struct diskhd {
    138 	int	b_flags;		/* not used, needed for consistency */
    139 	struct buf *b_forw, *b_back;	/* queue of unit queues */
    140 	struct buf *av_forw, *av_back;	/* queue of bufs for this unit */
    141 	o_dev_t	b_dev;			/* OLD major+minor device name */
    142 	size_t b_bcount;		/* transfer count */
    143 };
    144 
    145 
    146 /*
    147  * Statistics on the buffer cache
    148  */
    149 struct biostats {
    150 	kstat_named_t	bio_lookup;	/* requests to assign buffer */
    151 	kstat_named_t	bio_hit;	/* buffer already associated with blk */
    152 	kstat_named_t	bio_bufwant;	/* kmem_allocs NOSLEEP failed new buf */
    153 	kstat_named_t	bio_bufwait;	/* kmem_allocs with KM_SLEEP for buf */
    154 	kstat_named_t	bio_bufbusy;	/* buffer locked by someone else */
    155 	kstat_named_t	bio_bufdup;	/* duplicate buffer found for block */
    156 };
    157 
    158 /*
    159  * These flags are kept in b_flags.
    160  * The first group is part of the DDI
    161  */
    162 #define	B_BUSY		0x0001	/* not on av_forw/back list */
    163 #define	B_DONE		0x0002	/* transaction finished */
    164 #define	B_ERROR		0x0004	/* transaction aborted */
    165 #define	B_PAGEIO	0x0010	/* do I/O to pages on bp->p_pages */
    166 #define	B_PHYS		0x0020	/* Physical IO potentially using UNIBUS map */
    167 #define	B_READ		0x0040	/* read when I/O occurs */
    168 #define	B_WRITE		0x0100	/* non-read pseudo-flag */
    169 
    170 /* Not part of the DDI */
    171 #define	B_WANTED	0x0080		/* issue wakeup when BUSY goes off */
    172 #define	B_AGE		0x000200	/* delayed write for correct aging */
    173 #define	B_ASYNC		0x000400	/* don't wait for I/O completion */
    174 #define	B_DELWRI	0x000800	/* delayed write-wait til buf needed */
    175 #define	B_STALE		0x001000	/* on av_* list; invalid contents */
    176 #define	B_DONTNEED	0x002000	/* after write, need not be cached */
    177 #define	B_REMAPPED	0x004000	/* buffer is kernel addressable */
    178 #define	B_FREE		0x008000	/* free page when done */
    179 #define	B_INVAL		0x010000	/* destroy page when done */
    180 #define	B_FORCE		0x020000	/* semi-permanent removal from cache */
    181 #define	B_NOCACHE	0x080000 	/* don't cache block when released */
    182 #define	B_TRUNC		0x100000	/* truncate page without I/O */
    183 #define	B_SHADOW	0x200000	/* is b_shadow field valid? */
    184 #define	B_RETRYWRI	0x400000	/* retry write til works or bfinval */
    185 #define	B_FAILFAST	0x1000000	/* Fail promptly if device goes away */
    186 #define	B_STARTED	0x2000000	/* io:::start probe called for buf */
    187 #define	B_ABRWRITE	0x4000000	/* Application based recovery active */
    188 #define	B_PAGE_NOWAIT	0x8000000	/* Skip the page if it is locked */
    189 
    190 /*
    191  * There is some confusion over the meaning of B_FREE and B_INVAL and what
    192  * the use of one over the other implies.
    193  *
    194  * In both cases, when we are done with the page (buffer) we want to free
    195  * up the page.  In the case of B_FREE, the page will go to the cachelist.
 * In the case of B_INVAL, the page will be destroyed (hashed out of its
    197  * vnode) and placed on the freelist.  Beyond this, there is no difference
    198  * between the sole use of these two flags.  In both cases, IO will be done
    199  * if the page is not yet committed to storage.
    200  *
    201  * In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
    202  * should be used.
    203  *
    204  * Use (B_INVAL | B_FORCE) to force the page to be destroyed even if we
 * could not successfully write out the page.
    206  */
    207 
    208 /*
    209  * Insq/Remq for the buffer hash lists.
    210  */
    211 #define	bremhash(bp) { \
    212 	ASSERT((bp)->b_forw != NULL); \
    213 	ASSERT((bp)->b_back != NULL); \
    214 	(bp)->b_back->b_forw = (bp)->b_forw; \
    215 	(bp)->b_forw->b_back = (bp)->b_back; \
    216 	(bp)->b_forw = (bp)->b_back = NULL; \
    217 }
    218 #define	binshash(bp, dp) { \
    219 	ASSERT((bp)->b_forw == NULL); \
    220 	ASSERT((bp)->b_back == NULL); \
    221 	ASSERT((dp)->b_forw != NULL); \
    222 	ASSERT((dp)->b_back != NULL); \
    223 	(bp)->b_forw = (dp)->b_forw; \
    224 	(bp)->b_back = (dp); \
    225 	(dp)->b_forw->b_back = (bp); \
    226 	(dp)->b_forw = (bp); \
    227 }
    228 
    229 
    230 /*
    231  * The hash structure maintains two lists:
    232  *
    233  * 	1) The hash list of buffers (b_forw & b_back)
    234  *	2) The LRU free list of buffers on this hash bucket (av_forw & av_back)
    235  *
    236  * The dwbuf structure keeps a list of delayed write buffers per hash bucket
    237  * hence there are exactly the same number of dwbuf structures as there are
    238  * the hash buckets (hbuf structures) in the system.
    239  *
    240  * The number of buffers on the freelist may not be equal to the number of
    241  * buffers on the hash list. That is because when buffers are busy they are
    242  * taken off the freelist but not off the hash list. "b_length" field keeps
    243  * track of the number of free buffers (including delayed writes ones) on
    244  * the hash bucket. The "b_lock" mutex protects the free list as well as
    245  * the hash list. It also protects the counter "b_length".
    246  *
 * Entries b_forw, b_back, av_forw & av_back must be at the same offset
    248  * as the ones in buf structure.
    249  */
    250 struct	hbuf {
    251 	int	b_flags;
    252 
    253 	struct	buf	*b_forw;	/* hash list forw pointer */
    254 	struct	buf	*b_back;	/* hash list back pointer */
    255 
    256 	struct	buf	*av_forw;	/* free list forw pointer */
    257 	struct	buf	*av_back;	/* free list back pointer */
    258 
    259 	int		b_length;	/* # of entries on free list */
    260 	kmutex_t	b_lock;		/* lock to protect this structure */
    261 };
    262 
    263 
    264 /*
 * The delayed list pointer entries should match with the buf structure.
    266  */
    267 struct	dwbuf {
    268 	int	b_flags;		/* not used */
    269 
    270 	struct	buf	*b_forw;	/* not used */
    271 	struct	buf	*b_back;	/* not used */
    272 
    273 	struct	buf	*av_forw;	/* delayed write forw pointer */
    274 	struct	buf	*av_back;	/* delayed write back pointer */
    275 };
    276 
    277 
    278 /*
    279  * Unlink a buffer from the available (free or delayed write) list and mark
    280  * it busy (internal interface).
    281  */
    282 #define	notavail(bp) \
    283 {\
    284 	ASSERT(SEMA_HELD(&bp->b_sem)); \
    285 	ASSERT((bp)->av_forw != NULL); \
    286 	ASSERT((bp)->av_back != NULL); \
    287 	ASSERT((bp)->av_forw != (bp)); \
    288 	ASSERT((bp)->av_back != (bp)); \
    289 	(bp)->av_back->av_forw = (bp)->av_forw; \
    290 	(bp)->av_forw->av_back = (bp)->av_back; \
    291 	(bp)->b_flags |= B_BUSY; \
    292 	(bp)->av_forw = (bp)->av_back = NULL; \
    293 }
    294 
    295 #if defined(_KERNEL)
    296 /*
    297  * Macros to avoid the extra function call needed for binary compat.
    298  *
    299  * B_RETRYWRI is not included in clear_flags for BWRITE(), BWRITE2(),
    300  * or brwrite() so that the retry operation is persistent until the
    301  * write either succeeds or the buffer is bfinval()'d.
    302  *
    303  */
    304 #define	BREAD(dev, blkno, bsize) \
    305 	bread_common(/* ufsvfsp */ NULL, dev, blkno, bsize)
    306 
    307 #define	BWRITE(bp) \
    308 	bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 0, \
    309 		/* do_relse */ 1, \
    310 		/* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
    311 
    312 #define	BWRITE2(bp) \
    313 	bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 1, \
    314 		/* do_relse */ 0, \
    315 		/* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
    316 
    317 #define	GETBLK(dev, blkno, bsize) \
    318 	getblk_common(/* ufsvfsp */ NULL, dev, blkno, bsize, /* errflg */ 0)
    319 
    320 
    321 /*
    322  * Macros for new retry write interfaces.
    323  */
    324 
    325 /*
    326  * Same as bdwrite() except write failures are retried.
    327  */
    328 #define	bdrwrite(bp) { \
    329 	(bp)->b_flags |= B_RETRYWRI; \
    330 	bdwrite((bp)); \
    331 }
    332 
    333 /*
    334  * Same as bwrite() except write failures are retried.
    335  */
    336 #define	brwrite(bp) { \
    337 	(bp)->b_flags |= B_RETRYWRI; \
    338 	bwrite_common((bp), /* force_wait */ 0, /* do_relse */ 1, \
    339 		/* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI)); \
    340 }
    341 
    342 extern struct hbuf	*hbuf;		/* Hash table */
    343 extern struct dwbuf	*dwbuf;		/* delayed write hash table */
    344 extern struct buf	*buf;		/* The buffer pool itself */
    345 extern struct buf	bfreelist;	/* head of available list */
    346 
    347 extern void (*bio_lufs_strategy)(void *, buf_t *);	/* UFS Logging */
    348 extern void (*bio_snapshot_strategy)(void *, buf_t *);	/* UFS snapshots */
    349 
    350 int	bcheck(dev_t, struct buf *);
    351 int	iowait(struct buf *);
    352 int	hash2ints(int x, int y);
    353 int	bio_busy(int);
    354 int	biowait(struct buf *);
    355 int	biomodified(struct buf *);
    356 int	geterror(struct buf *);
    357 void	minphys(struct buf *);
    358 /*
    359  * ufsvfsp is declared as a void * to avoid having everyone that uses
    360  * this header file include sys/fs/ufs_inode.h.
    361  */
    362 void	bwrite_common(void *ufsvfsp, struct buf *, int force_wait,
    363 	int do_relse, int clear_flags);
    364 void	bwrite(struct buf *);
    365 void	bwrite2(struct buf *);
    366 void	bdwrite(struct buf *);
    367 void	bawrite(struct buf *);
    368 void	brelse(struct buf *);
    369 void	iodone(struct buf *);
    370 void	clrbuf(struct buf *);
    371 void	bflush(dev_t);
    372 void	blkflush(dev_t, daddr_t);
    373 void	binval(dev_t);
    374 int	bfinval(dev_t, int);
    375 void	binit(void);
    376 void	biodone(struct buf *);
    377 void	bioinit(struct buf *);
    378 void	biofini(struct buf *);
    379 void	bp_mapin(struct buf *);
    380 void	*bp_mapin_common(struct buf *, int);
    381 void	bp_mapout(struct buf *);
    382 int	bp_copyin(struct buf *, void *, offset_t, size_t);
    383 int	bp_copyout(void *, struct buf *, offset_t, size_t);
    384 void	bp_init(size_t, uint_t);
    385 int	bp_color(struct buf *);
    386 void	pageio_done(struct buf *);
    387 struct buf *bread(dev_t, daddr_t, long);
    388 struct buf *bread_common(void *, dev_t, daddr_t, long);
    389 struct buf *breada(dev_t, daddr_t, daddr_t, long);
    390 struct buf *getblk(dev_t, daddr_t, long);
    391 struct buf *getblk_common(void *, dev_t, daddr_t, long, int);
    392 struct buf *ngeteblk(long);
    393 struct buf *geteblk(void);
    394 struct buf *pageio_setup(struct page *, size_t, struct vnode *, int);
    395 void bioerror(struct buf *bp, int error);
    396 void bioreset(struct buf *bp);
    397 struct buf *bioclone(struct buf *, off_t, size_t, dev_t, daddr_t,
    398 	int (*)(struct buf *), struct buf *, int);
    399 size_t	biosize(void);
    400 #endif	/* defined(_KERNEL) */
    401 
    402 #ifdef	__cplusplus
    403 }
    404 #endif
    405 
    406 #endif	/* _SYS_BUF_H */
    407