OpenGrok

Cross Reference: dmu_traverse.c
xref: /onnv/onnv-gate/usr/src/uts/common/fs/zfs/dmu_traverse.c
Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
     23  */
     24 
     25 #include <sys/zfs_context.h>
     26 #include <sys/dmu_objset.h>
     27 #include <sys/dmu_traverse.h>
     28 #include <sys/dsl_dataset.h>
     29 #include <sys/dsl_dir.h>
     30 #include <sys/dsl_pool.h>
     31 #include <sys/dnode.h>
     32 #include <sys/spa.h>
     33 #include <sys/zio.h>
     34 #include <sys/dmu_impl.h>
     35 #include <sys/sa.h>
     36 #include <sys/sa_impl.h>
     37 #include <sys/callb.h>
     38 
/*
 * Tunable: copied into pd_blks_max for every traversal.  The prefetcher
 * stops issuing reads while pd_blks_fetched has reached this value.
 */
int zfs_pd_blks_max = 100;
     40 
     41 typedef struct prefetch_data {
     42 	kmutex_t pd_mtx;
     43 	kcondvar_t pd_cv;
     44 	int pd_blks_max;
     45 	int pd_blks_fetched;
     46 	int pd_flags;
     47 	boolean_t pd_cancel;
     48 	boolean_t pd_exited;
     49 } prefetch_data_t;
     50 
     51 typedef struct traverse_data {
     52 	spa_t *td_spa;
     53 	uint64_t td_objset;
     54 	blkptr_t *td_rootbp;
     55 	uint64_t td_min_txg;
     56 	int td_flags;
     57 	prefetch_data_t *td_pfd;
     58 	blkptr_cb_t *td_func;
     59 	void *td_arg;
     60 } traverse_data_t;
     61 
     62 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     63     arc_buf_t *buf, uint64_t objset, uint64_t object);
     64 
     65 static int
     66 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
     67 {
     68 	traverse_data_t *td = arg;
     69 	zbookmark_t zb;
     70 
     71 	if (bp->blk_birth == 0)
     72 		return (0);
     73 
     74 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
     75 		return (0);
     76 
     77 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
     78 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
     79 
     80 	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
     81 
     82 	return (0);
     83 }
     84 
     85 static int
     86 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
     87 {
     88 	traverse_data_t *td = arg;
     89 
     90 	if (lrc->lrc_txtype == TX_WRITE) {
     91 		lr_write_t *lr = (lr_write_t *)lrc;
     92 		blkptr_t *bp = &lr->lr_blkptr;
     93 		zbookmark_t zb;
     94 
     95 		if (bp->blk_birth == 0)
     96 			return (0);
     97 
     98 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
     99 			return (0);
    100 
    101 		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
    102 		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
    103 
    104 		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
    105 		    td->td_arg);
    106 	}
    107 	return (0);
    108 }
    109 
    110 static void
    111 traverse_zil(traverse_data_t *td, zil_header_t *zh)
    112 {
    113 	uint64_t claim_txg = zh->zh_claim_txg;
    114 	zilog_t *zilog;
    115 
    116 	/*
    117 	 * We only want to visit blocks that have been claimed but not yet
    118 	 * replayed; plus, in read-only mode, blocks that are already stable.
    119 	 */
    120 	if (claim_txg == 0 && spa_writeable(td->td_spa))
    121 		return;
    122 
    123 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
    124 
    125 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
    126 	    claim_txg);
    127 
    128 	zil_free(zilog);
    129 }
    130 
    131 static int
    132 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    133     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
    134 {
    135 	zbookmark_t czb;
    136 	int err = 0, lasterr = 0;
    137 	arc_buf_t *buf = NULL;
    138 	prefetch_data_t *pd = td->td_pfd;
    139 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
    140 
    141 	if (bp->blk_birth == 0) {
    142 		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
    143 		    td->td_arg);
    144 		return (err);
    145 	}
    146 
    147 	if (bp->blk_birth <= td->td_min_txg)
    148 		return (0);
    149 
    150 	if (pd && !pd->pd_exited &&
    151 	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
    152 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
    153 		mutex_enter(&pd->pd_mtx);
    154 		ASSERT(pd->pd_blks_fetched >= 0);
    155 		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
    156 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
    157 		pd->pd_blks_fetched--;
    158 		cv_broadcast(&pd->pd_cv);
    159 		mutex_exit(&pd->pd_mtx);
    160 	}
    161 
    162 	if (td->td_flags & TRAVERSE_PRE) {
    163 		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
    164 		    td->td_arg);
    165 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
    166 			return (0);
    167 		if (err)
    168 			return (err);
    169 	}
    170 
    171 	if (BP_GET_LEVEL(bp) > 0) {
    172 		uint32_t flags = ARC_WAIT;
    173 		int i;
    174 		blkptr_t *cbp;
    175 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
    176 
    177 		err = dsl_read(NULL, td->td_spa, bp, pbuf,
    178 		    arc_getbuf_func, &buf,
    179 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    180 		if (err)
    181 			return (err);
    182 
    183 		/* recursively visitbp() blocks below this */
    184 		cbp = buf->b_data;
    185 		for (i = 0; i < epb; i++, cbp++) {
    186 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
    187 			    zb->zb_level - 1,
    188 			    zb->zb_blkid * epb + i);
    189 			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
    190 			if (err) {
    191 				if (!hard)
    192 					break;
    193 				lasterr = err;
    194 			}
    195 		}
    196 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
    197 		uint32_t flags = ARC_WAIT;
    198 		int i;
    199 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
    200 
    201 		err = dsl_read(NULL, td->td_spa, bp, pbuf,
    202 		    arc_getbuf_func, &buf,
    203 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    204 		if (err)
    205 			return (err);
    206 
    207 		/* recursively visitbp() blocks below this */
    208 		dnp = buf->b_data;
    209 		for (i = 0; i < epb; i++, dnp++) {
    210 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    211 			    zb->zb_blkid * epb + i);
    212 			if (err) {
    213 				if (!hard)
    214 					break;
    215 				lasterr = err;
    216 			}
    217 		}
    218 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
    219 		uint32_t flags = ARC_WAIT;
    220 		objset_phys_t *osp;
    221 		dnode_phys_t *dnp;
    222 
    223 		err = dsl_read_nolock(NULL, td->td_spa, bp,
    224 		    arc_getbuf_func, &buf,
    225 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    226 		if (err)
    227 			return (err);
    228 
    229 		osp = buf->b_data;
    230 		dnp = &osp->os_meta_dnode;
    231 		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    232 		    DMU_META_DNODE_OBJECT);
    233 		if (err && hard) {
    234 			lasterr = err;
    235 			err = 0;
    236 		}
    237 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    238 			dnp = &osp->os_userused_dnode;
    239 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    240 			    DMU_USERUSED_OBJECT);
    241 		}
    242 		if (err && hard) {
    243 			lasterr = err;
    244 			err = 0;
    245 		}
    246 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    247 			dnp = &osp->os_groupused_dnode;
    248 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    249 			    DMU_GROUPUSED_OBJECT);
    250 		}
    251 	}
    252 
    253 	if (buf)
    254 		(void) arc_buf_remove_ref(buf, &buf);
    255 
    256 	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
    257 		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
    258 		    td->td_arg);
    259 	}
    260 
    261 	return (err != 0 ? err : lasterr);
    262 }
    263 
    264 static int
    265 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    266     arc_buf_t *buf, uint64_t objset, uint64_t object)
    267 {
    268 	int j, err = 0, lasterr = 0;
    269 	zbookmark_t czb;
    270 	boolean_t hard = (td->td_flags & TRAVERSE_HARD);
    271 
    272 	for (j = 0; j < dnp->dn_nblkptr; j++) {
    273 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
    274 		err = traverse_visitbp(td, dnp, buf,
    275 		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
    276 		if (err) {
    277 			if (!hard)
    278 				break;
    279 			lasterr = err;
    280 		}
    281 	}
    282 
    283 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
    284 		SET_BOOKMARK(&czb, objset,
    285 		    object, 0, DMU_SPILL_BLKID);
    286 		err = traverse_visitbp(td, dnp, buf,
    287 		    (blkptr_t *)&dnp->dn_spill, &czb);
    288 		if (err) {
    289 			if (!hard)
    290 				return (err);
    291 			lasterr = err;
    292 		}
    293 	}
    294 	return (err != 0 ? err : lasterr);
    295 }
    296 
    297 /* ARGSUSED */
    298 static int
    299 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    300     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
    301     void *arg)
    302 {
    303 	prefetch_data_t *pfd = arg;
    304 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
    305 
    306 	ASSERT(pfd->pd_blks_fetched >= 0);
    307 	if (pfd->pd_cancel)
    308 		return (EINTR);
    309 
    310 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
    311 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
    312 	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
    313 		return (0);
    314 
    315 	mutex_enter(&pfd->pd_mtx);
    316 	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
    317 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
    318 	pfd->pd_blks_fetched++;
    319 	cv_broadcast(&pfd->pd_cv);
    320 	mutex_exit(&pfd->pd_mtx);
    321 
    322 	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
    323 	    ZIO_PRIORITY_ASYNC_READ,
    324 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
    325 	    &aflags, zb);
    326 
    327 	return (0);
    328 }
    329 
    330 static void
    331 traverse_prefetch_thread(void *arg)
    332 {
    333 	traverse_data_t *td_main = arg;
    334 	traverse_data_t td = *td_main;
    335 	zbookmark_t czb;
    336 
    337 	td.td_func = traverse_prefetcher;
    338 	td.td_arg = td_main->td_pfd;
    339 	td.td_pfd = NULL;
    340 
    341 	SET_BOOKMARK(&czb, td.td_objset,
    342 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    343 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
    344 
    345 	mutex_enter(&td_main->td_pfd->pd_mtx);
    346 	td_main->td_pfd->pd_exited = B_TRUE;
    347 	cv_broadcast(&td_main->td_pfd->pd_cv);
    348 	mutex_exit(&td_main->td_pfd->pd_mtx);
    349 }
    350 
    351 /*
    352  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
    353  * in syncing context).
    354  */
    355 static int
    356 traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
    357     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
    358 {
    359 	traverse_data_t td;
    360 	prefetch_data_t pd = { 0 };
    361 	zbookmark_t czb;
    362 	int err;
    363 
    364 	td.td_spa = spa;
    365 	td.td_objset = ds ? ds->ds_object : 0;
    366 	td.td_rootbp = rootbp;
    367 	td.td_min_txg = txg_start;
    368 	td.td_func = func;
    369 	td.td_arg = arg;
    370 	td.td_pfd = &pd;
    371 	td.td_flags = flags;
    372 
    373 	pd.pd_blks_max = zfs_pd_blks_max;
    374 	pd.pd_flags = flags;
    375 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
    376 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
    377 
    378 	/* See comment on ZIL traversal in dsl_scan_visitds. */
    379 	if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
    380 		objset_t *os;
    381 
    382 		err = dmu_objset_from_ds(ds, &os);
    383 		if (err)
    384 			return (err);
    385 
    386 		traverse_zil(&td, &os->os_zil_header);
    387 	}
    388 
    389 	if (!(flags & TRAVERSE_PREFETCH) ||
    390 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
    391 	    &td, TQ_NOQUEUE))
    392 		pd.pd_exited = B_TRUE;
    393 
    394 	SET_BOOKMARK(&czb, td.td_objset,
    395 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    396 	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
    397 
    398 	mutex_enter(&pd.pd_mtx);
    399 	pd.pd_cancel = B_TRUE;
    400 	cv_broadcast(&pd.pd_cv);
    401 	while (!pd.pd_exited)
    402 		cv_wait(&pd.pd_cv, &pd.pd_mtx);
    403 	mutex_exit(&pd.pd_mtx);
    404 
    405 	mutex_destroy(&pd.pd_mtx);
    406 	cv_destroy(&pd.pd_cv);
    407 
    408 	return (err);
    409 }
    410 
    411 /*
    412  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
    413  * in syncing context).
    414  */
    415 int
    416 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
    417     blkptr_cb_t func, void *arg)
    418 {
    419 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
    420 	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
    421 }
    422 
    423 /*
    424  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
    425  */
    426 int
    427 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    428     blkptr_cb_t func, void *arg)
    429 {
    430 	int err, lasterr = 0;
    431 	uint64_t obj;
    432 	dsl_pool_t *dp = spa_get_dsl(spa);
    433 	objset_t *mos = dp->dp_meta_objset;
    434 	boolean_t hard = (flags & TRAVERSE_HARD);
    435 
    436 	/* visit the MOS */
    437 	err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
    438 	    txg_start, flags, func, arg);
    439 	if (err)
    440 		return (err);
    441 
    442 	/* visit each dataset */
    443 	for (obj = 1; err == 0 || (err != ESRCH && hard);
    444 	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
    445 		dmu_object_info_t doi;
    446 
    447 		err = dmu_object_info(mos, obj, &doi);
    448 		if (err) {
    449 			if (!hard)
    450 				return (err);
    451 			lasterr = err;
    452 			continue;
    453 		}
    454 
    455 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
    456 			dsl_dataset_t *ds;
    457 			uint64_t txg = txg_start;
    458 
    459 			rw_enter(&dp->dp_config_rwlock, RW_READER);
    460 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
    461 			rw_exit(&dp->dp_config_rwlock);
    462 			if (err) {
    463 				if (!hard)
    464 					return (err);
    465 				lasterr = err;
    466 				continue;
    467 			}
    468 			if (ds->ds_phys->ds_prev_snap_txg > txg)
    469 				txg = ds->ds_phys->ds_prev_snap_txg;
    470 			err = traverse_dataset(ds, txg, flags, func, arg);
    471 			dsl_dataset_rele(ds, FTAG);
    472 			if (err) {
    473 				if (!hard)
    474 					return (err);
    475 				lasterr = err;
    476 			}
    477 		}
    478 	}
    479 	if (err == ESRCH)
    480 		err = 0;
    481 	return (err != 0 ? err : lasterr);
    482 }
    483