Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/zfs_context.h>
     27 #include <sys/dmu_objset.h>
     28 #include <sys/dmu_traverse.h>
     29 #include <sys/dsl_dataset.h>
     30 #include <sys/dsl_dir.h>
     31 #include <sys/dsl_pool.h>
     32 #include <sys/dnode.h>
     33 #include <sys/spa.h>
     34 #include <sys/zio.h>
     35 #include <sys/dmu_impl.h>
     36 #include <sys/callb.h>
     37 
     38 #define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
     39 {                                                       \
     40 	(zb)->zb_objset = objset;                       \
     41 	(zb)->zb_object = object;                       \
     42 	(zb)->zb_level = level;                         \
     43 	(zb)->zb_blkid = blkid;                         \
     44 }
     45 
     46 struct prefetch_data {
     47 	kmutex_t pd_mtx;
     48 	kcondvar_t pd_cv;
     49 	int pd_blks_max;
     50 	int pd_blks_fetched;
     51 	int pd_flags;
     52 	boolean_t pd_cancel;
     53 	boolean_t pd_exited;
     54 };
     55 
     56 struct traverse_data {
     57 	spa_t *td_spa;
     58 	uint64_t td_objset;
     59 	blkptr_t *td_rootbp;
     60 	uint64_t td_min_txg;
     61 	int td_flags;
     62 	struct prefetch_data *td_pfd;
     63 	blkptr_cb_t *td_func;
     64 	void *td_arg;
     65 };
     66 
     67 static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
     68     arc_buf_t *buf, uint64_t objset, uint64_t object);
     69 
     70 /* ARGSUSED */
     71 static void
     72 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
     73 {
     74 	struct traverse_data *td = arg;
     75 	zbookmark_t zb;
     76 
     77 	if (bp->blk_birth == 0)
     78 		return;
     79 
     80 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
     81 		return;
     82 
     83 	zb.zb_objset = td->td_objset;
     84 	zb.zb_object = 0;
     85 	zb.zb_level = -1;
     86 	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
     87 	VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
     88 }
     89 
     90 /* ARGSUSED */
     91 static void
     92 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
     93 {
     94 	struct traverse_data *td = arg;
     95 
     96 	if (lrc->lrc_txtype == TX_WRITE) {
     97 		lr_write_t *lr = (lr_write_t *)lrc;
     98 		blkptr_t *bp = &lr->lr_blkptr;
     99 		zbookmark_t zb;
    100 
    101 		if (bp->blk_birth == 0)
    102 			return;
    103 
    104 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
    105 			return;
    106 
    107 		zb.zb_objset = td->td_objset;
    108 		zb.zb_object = lr->lr_foid;
    109 		zb.zb_level = BP_GET_LEVEL(bp);
    110 		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
    111 		VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
    112 	}
    113 }
    114 
    115 static void
    116 traverse_zil(struct traverse_data *td, zil_header_t *zh)
    117 {
    118 	uint64_t claim_txg = zh->zh_claim_txg;
    119 	zilog_t *zilog;
    120 
    121 	/*
    122 	 * We only want to visit blocks that have been claimed but not yet
    123 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
    124 	 */
    125 	if (claim_txg == 0 && spa_writeable(td->td_spa))
    126 		return;
    127 
    128 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
    129 
    130 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
    131 	    claim_txg);
    132 
    133 	zil_free(zilog);
    134 }
    135 
    136 static int
    137 traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
    138     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
    139 {
    140 	zbookmark_t czb;
    141 	int err = 0;
    142 	arc_buf_t *buf = NULL;
    143 	struct prefetch_data *pd = td->td_pfd;
    144 
    145 	if (bp->blk_birth == 0) {
    146 		err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
    147 		return (err);
    148 	}
    149 
    150 	if (bp->blk_birth <= td->td_min_txg)
    151 		return (0);
    152 
    153 	if (pd && !pd->pd_exited &&
    154 	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
    155 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
    156 		mutex_enter(&pd->pd_mtx);
    157 		ASSERT(pd->pd_blks_fetched >= 0);
    158 		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
    159 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
    160 		pd->pd_blks_fetched--;
    161 		cv_broadcast(&pd->pd_cv);
    162 		mutex_exit(&pd->pd_mtx);
    163 	}
    164 
    165 	if (td->td_flags & TRAVERSE_PRE) {
    166 		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
    167 		if (err)
    168 			return (err);
    169 	}
    170 
    171 	if (BP_GET_LEVEL(bp) > 0) {
    172 		uint32_t flags = ARC_WAIT;
    173 		int i;
    174 		blkptr_t *cbp;
    175 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
    176 
    177 		err = arc_read(NULL, td->td_spa, bp, pbuf,
    178 		    arc_getbuf_func, &buf,
    179 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    180 		if (err)
    181 			return (err);
    182 
    183 		/* recursively visitbp() blocks below this */
    184 		cbp = buf->b_data;
    185 		for (i = 0; i < epb; i++, cbp++) {
    186 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
    187 			    zb->zb_level - 1,
    188 			    zb->zb_blkid * epb + i);
    189 			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
    190 			if (err)
    191 				break;
    192 		}
    193 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
    194 		uint32_t flags = ARC_WAIT;
    195 		int i;
    196 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
    197 
    198 		err = arc_read(NULL, td->td_spa, bp, pbuf,
    199 		    arc_getbuf_func, &buf,
    200 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    201 		if (err)
    202 			return (err);
    203 
    204 		/* recursively visitbp() blocks below this */
    205 		dnp = buf->b_data;
    206 		for (i = 0; i < epb && err == 0; i++, dnp++) {
    207 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    208 			    zb->zb_blkid * epb + i);
    209 			if (err)
    210 				break;
    211 		}
    212 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
    213 		uint32_t flags = ARC_WAIT;
    214 		objset_phys_t *osp;
    215 		dnode_phys_t *dnp;
    216 
    217 		err = arc_read_nolock(NULL, td->td_spa, bp,
    218 		    arc_getbuf_func, &buf,
    219 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    220 		if (err)
    221 			return (err);
    222 
    223 		osp = buf->b_data;
    224 		traverse_zil(td, &osp->os_zil_header);
    225 
    226 		dnp = &osp->os_meta_dnode;
    227 		err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
    228 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    229 			dnp = &osp->os_userused_dnode;
    230 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    231 			    DMU_USERUSED_OBJECT);
    232 		}
    233 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    234 			dnp = &osp->os_groupused_dnode;
    235 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    236 			    DMU_GROUPUSED_OBJECT);
    237 		}
    238 	}
    239 
    240 	if (buf)
    241 		(void) arc_buf_remove_ref(buf, &buf);
    242 
    243 	if (err == 0 && (td->td_flags & TRAVERSE_POST))
    244 		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
    245 
    246 	return (err);
    247 }
    248 
    249 static int
    250 traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
    251     arc_buf_t *buf, uint64_t objset, uint64_t object)
    252 {
    253 	int j, err = 0;
    254 	zbookmark_t czb;
    255 
    256 	for (j = 0; j < dnp->dn_nblkptr; j++) {
    257 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
    258 		err = traverse_visitbp(td, dnp, buf,
    259 		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
    260 		if (err)
    261 			break;
    262 	}
    263 	return (err);
    264 }
    265 
    266 /* ARGSUSED */
    267 static int
    268 traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
    269     const dnode_phys_t *dnp, void *arg)
    270 {
    271 	struct prefetch_data *pfd = arg;
    272 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
    273 
    274 	ASSERT(pfd->pd_blks_fetched >= 0);
    275 	if (pfd->pd_cancel)
    276 		return (EINTR);
    277 
    278 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
    279 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
    280 		return (0);
    281 
    282 	mutex_enter(&pfd->pd_mtx);
    283 	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
    284 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
    285 	pfd->pd_blks_fetched++;
    286 	cv_broadcast(&pfd->pd_cv);
    287 	mutex_exit(&pfd->pd_mtx);
    288 
    289 	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
    290 	    ZIO_PRIORITY_ASYNC_READ,
    291 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
    292 	    &aflags, zb);
    293 
    294 	return (0);
    295 }
    296 
    297 static void
    298 traverse_prefetch_thread(void *arg)
    299 {
    300 	struct traverse_data *td_main = arg;
    301 	struct traverse_data td = *td_main;
    302 	zbookmark_t czb;
    303 
    304 	td.td_func = traverse_prefetcher;
    305 	td.td_arg = td_main->td_pfd;
    306 	td.td_pfd = NULL;
    307 
    308 	SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
    309 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
    310 
    311 	mutex_enter(&td_main->td_pfd->pd_mtx);
    312 	td_main->td_pfd->pd_exited = B_TRUE;
    313 	cv_broadcast(&td_main->td_pfd->pd_cv);
    314 	mutex_exit(&td_main->td_pfd->pd_mtx);
    315 }
    316 
    317 /*
    318  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
    319  * in syncing context).
    320  */
    321 static int
    322 traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
    323     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
    324 {
    325 	struct traverse_data td;
    326 	struct prefetch_data pd = { 0 };
    327 	zbookmark_t czb;
    328 	int err;
    329 
    330 	td.td_spa = spa;
    331 	td.td_objset = objset;
    332 	td.td_rootbp = rootbp;
    333 	td.td_min_txg = txg_start;
    334 	td.td_func = func;
    335 	td.td_arg = arg;
    336 	td.td_pfd = &pd;
    337 	td.td_flags = flags;
    338 
    339 	pd.pd_blks_max = 100;
    340 	pd.pd_flags = flags;
    341 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
    342 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
    343 
    344 	if (!(flags & TRAVERSE_PREFETCH) ||
    345 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
    346 	    &td, TQ_NOQUEUE))
    347 		pd.pd_exited = B_TRUE;
    348 
    349 	SET_BOOKMARK(&czb, objset, 0, -1, 0);
    350 	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
    351 
    352 	mutex_enter(&pd.pd_mtx);
    353 	pd.pd_cancel = B_TRUE;
    354 	cv_broadcast(&pd.pd_cv);
    355 	while (!pd.pd_exited)
    356 		cv_wait(&pd.pd_cv, &pd.pd_mtx);
    357 	mutex_exit(&pd.pd_mtx);
    358 
    359 	mutex_destroy(&pd.pd_mtx);
    360 	cv_destroy(&pd.pd_cv);
    361 
    362 	return (err);
    363 }
    364 
    365 /*
    366  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
    367  * in syncing context).
    368  */
    369 int
    370 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
    371     blkptr_cb_t func, void *arg)
    372 {
    373 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
    374 	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
    375 }
    376 
    377 /*
    378  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
    379  */
    380 int
    381 traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
    382 {
    383 	int err;
    384 	uint64_t obj;
    385 	dsl_pool_t *dp = spa_get_dsl(spa);
    386 	objset_t *mos = dp->dp_meta_objset;
    387 
    388 	/* visit the MOS */
    389 	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
    390 	    0, TRAVERSE_PRE, func, arg);
    391 	if (err)
    392 		return (err);
    393 
    394 	/* visit each dataset */
    395 	for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
    396 		dmu_object_info_t doi;
    397 
    398 		err = dmu_object_info(mos, obj, &doi);
    399 		if (err)
    400 			return (err);
    401 
    402 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
    403 			dsl_dataset_t *ds;
    404 			rw_enter(&dp->dp_config_rwlock, RW_READER);
    405 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
    406 			rw_exit(&dp->dp_config_rwlock);
    407 			if (err)
    408 				return (err);
    409 			err = traverse_dataset(ds,
    410 			    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
    411 			    func, arg);
    412 			dsl_dataset_rele(ds, FTAG);
    413 			if (err)
    414 				return (err);
    415 		}
    416 	}
    417 	if (err == ESRCH)
    418 		err = 0;
    419 	return (err);
    420 }
    421