Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/zfs_context.h>
     27 #include <sys/dmu_objset.h>
     28 #include <sys/dmu_traverse.h>
     29 #include <sys/dsl_dataset.h>
     30 #include <sys/dsl_dir.h>
     31 #include <sys/dsl_pool.h>
     32 #include <sys/dnode.h>
     33 #include <sys/spa.h>
     34 #include <sys/zio.h>
     35 #include <sys/dmu_impl.h>
     36 #include <sys/callb.h>
     37 
     38 struct prefetch_data {
     39 	kmutex_t pd_mtx;
     40 	kcondvar_t pd_cv;
     41 	int pd_blks_max;
     42 	int pd_blks_fetched;
     43 	int pd_flags;
     44 	boolean_t pd_cancel;
     45 	boolean_t pd_exited;
     46 };
     47 
     48 struct traverse_data {
     49 	spa_t *td_spa;
     50 	uint64_t td_objset;
     51 	blkptr_t *td_rootbp;
     52 	uint64_t td_min_txg;
     53 	int td_flags;
     54 	struct prefetch_data *td_pfd;
     55 	blkptr_cb_t *td_func;
     56 	void *td_arg;
     57 };
     58 
     59 static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
     60     arc_buf_t *buf, uint64_t objset, uint64_t object);
     61 
     62 /* ARGSUSED */
     63 static int
     64 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
     65 {
     66 	struct traverse_data *td = arg;
     67 	zbookmark_t zb;
     68 
     69 	if (bp->blk_birth == 0)
     70 		return (0);
     71 
     72 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
     73 		return (0);
     74 
     75 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
     76 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
     77 
     78 	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
     79 
     80 	return (0);
     81 }
     82 
     83 /* ARGSUSED */
     84 static int
     85 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
     86 {
     87 	struct traverse_data *td = arg;
     88 
     89 	if (lrc->lrc_txtype == TX_WRITE) {
     90 		lr_write_t *lr = (lr_write_t *)lrc;
     91 		blkptr_t *bp = &lr->lr_blkptr;
     92 		zbookmark_t zb;
     93 
     94 		if (bp->blk_birth == 0)
     95 			return (0);
     96 
     97 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
     98 			return (0);
     99 
    100 		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
    101 		    lr->lr_offset / BP_GET_LSIZE(bp));
    102 
    103 		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
    104 		    td->td_arg);
    105 	}
    106 	return (0);
    107 }
    108 
    109 static void
    110 traverse_zil(struct traverse_data *td, zil_header_t *zh)
    111 {
    112 	uint64_t claim_txg = zh->zh_claim_txg;
    113 	zilog_t *zilog;
    114 
    115 	/*
    116 	 * We only want to visit blocks that have been claimed but not yet
    117 	 * replayed; plus, in read-only mode, blocks that are already stable.
    118 	 */
    119 	if (claim_txg == 0 && spa_writeable(td->td_spa))
    120 		return;
    121 
    122 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
    123 
    124 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
    125 	    claim_txg);
    126 
    127 	zil_free(zilog);
    128 }
    129 
    130 static int
    131 traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
    132     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
    133 {
    134 	zbookmark_t czb;
    135 	int err = 0;
    136 	arc_buf_t *buf = NULL;
    137 	struct prefetch_data *pd = td->td_pfd;
    138 
    139 	if (bp->blk_birth == 0) {
    140 		err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
    141 		return (err);
    142 	}
    143 
    144 	if (bp->blk_birth <= td->td_min_txg)
    145 		return (0);
    146 
    147 	if (pd && !pd->pd_exited &&
    148 	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
    149 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
    150 		mutex_enter(&pd->pd_mtx);
    151 		ASSERT(pd->pd_blks_fetched >= 0);
    152 		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
    153 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
    154 		pd->pd_blks_fetched--;
    155 		cv_broadcast(&pd->pd_cv);
    156 		mutex_exit(&pd->pd_mtx);
    157 	}
    158 
    159 	if (td->td_flags & TRAVERSE_PRE) {
    160 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
    161 		if (err)
    162 			return (err);
    163 	}
    164 
    165 	if (BP_GET_LEVEL(bp) > 0) {
    166 		uint32_t flags = ARC_WAIT;
    167 		int i;
    168 		blkptr_t *cbp;
    169 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
    170 
    171 		err = arc_read(NULL, td->td_spa, bp, pbuf,
    172 		    arc_getbuf_func, &buf,
    173 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    174 		if (err)
    175 			return (err);
    176 
    177 		/* recursively visitbp() blocks below this */
    178 		cbp = buf->b_data;
    179 		for (i = 0; i < epb; i++, cbp++) {
    180 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
    181 			    zb->zb_level - 1,
    182 			    zb->zb_blkid * epb + i);
    183 			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
    184 			if (err)
    185 				break;
    186 		}
    187 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
    188 		uint32_t flags = ARC_WAIT;
    189 		int i;
    190 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
    191 
    192 		err = arc_read(NULL, td->td_spa, bp, pbuf,
    193 		    arc_getbuf_func, &buf,
    194 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    195 		if (err)
    196 			return (err);
    197 
    198 		/* recursively visitbp() blocks below this */
    199 		dnp = buf->b_data;
    200 		for (i = 0; i < epb && err == 0; i++, dnp++) {
    201 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    202 			    zb->zb_blkid * epb + i);
    203 			if (err)
    204 				break;
    205 		}
    206 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
    207 		uint32_t flags = ARC_WAIT;
    208 		objset_phys_t *osp;
    209 		dnode_phys_t *dnp;
    210 
    211 		err = arc_read_nolock(NULL, td->td_spa, bp,
    212 		    arc_getbuf_func, &buf,
    213 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
    214 		if (err)
    215 			return (err);
    216 
    217 		osp = buf->b_data;
    218 		traverse_zil(td, &osp->os_zil_header);
    219 
    220 		dnp = &osp->os_meta_dnode;
    221 		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    222 		    DMU_META_DNODE_OBJECT);
    223 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    224 			dnp = &osp->os_userused_dnode;
    225 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    226 			    DMU_USERUSED_OBJECT);
    227 		}
    228 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
    229 			dnp = &osp->os_groupused_dnode;
    230 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
    231 			    DMU_GROUPUSED_OBJECT);
    232 		}
    233 	}
    234 
    235 	if (buf)
    236 		(void) arc_buf_remove_ref(buf, &buf);
    237 
    238 	if (err == 0 && (td->td_flags & TRAVERSE_POST))
    239 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
    240 
    241 	return (err);
    242 }
    243 
    244 static int
    245 traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
    246     arc_buf_t *buf, uint64_t objset, uint64_t object)
    247 {
    248 	int j, err = 0;
    249 	zbookmark_t czb;
    250 
    251 	for (j = 0; j < dnp->dn_nblkptr; j++) {
    252 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
    253 		err = traverse_visitbp(td, dnp, buf,
    254 		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
    255 		if (err)
    256 			break;
    257 	}
    258 	return (err);
    259 }
    260 
    261 /* ARGSUSED */
    262 static int
    263 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    264     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
    265 {
    266 	struct prefetch_data *pfd = arg;
    267 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
    268 
    269 	ASSERT(pfd->pd_blks_fetched >= 0);
    270 	if (pfd->pd_cancel)
    271 		return (EINTR);
    272 
    273 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
    274 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
    275 		return (0);
    276 
    277 	mutex_enter(&pfd->pd_mtx);
    278 	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
    279 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
    280 	pfd->pd_blks_fetched++;
    281 	cv_broadcast(&pfd->pd_cv);
    282 	mutex_exit(&pfd->pd_mtx);
    283 
    284 	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
    285 	    ZIO_PRIORITY_ASYNC_READ,
    286 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
    287 	    &aflags, zb);
    288 
    289 	return (0);
    290 }
    291 
    292 static void
    293 traverse_prefetch_thread(void *arg)
    294 {
    295 	struct traverse_data *td_main = arg;
    296 	struct traverse_data td = *td_main;
    297 	zbookmark_t czb;
    298 
    299 	td.td_func = traverse_prefetcher;
    300 	td.td_arg = td_main->td_pfd;
    301 	td.td_pfd = NULL;
    302 
    303 	SET_BOOKMARK(&czb, td.td_objset,
    304 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    305 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
    306 
    307 	mutex_enter(&td_main->td_pfd->pd_mtx);
    308 	td_main->td_pfd->pd_exited = B_TRUE;
    309 	cv_broadcast(&td_main->td_pfd->pd_cv);
    310 	mutex_exit(&td_main->td_pfd->pd_mtx);
    311 }
    312 
    313 /*
    314  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
    315  * in syncing context).
    316  */
    317 static int
    318 traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
    319     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
    320 {
    321 	struct traverse_data td;
    322 	struct prefetch_data pd = { 0 };
    323 	zbookmark_t czb;
    324 	int err;
    325 
    326 	td.td_spa = spa;
    327 	td.td_objset = objset;
    328 	td.td_rootbp = rootbp;
    329 	td.td_min_txg = txg_start;
    330 	td.td_func = func;
    331 	td.td_arg = arg;
    332 	td.td_pfd = &pd;
    333 	td.td_flags = flags;
    334 
    335 	pd.pd_blks_max = 100;
    336 	pd.pd_flags = flags;
    337 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
    338 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
    339 
    340 	if (!(flags & TRAVERSE_PREFETCH) ||
    341 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
    342 	    &td, TQ_NOQUEUE))
    343 		pd.pd_exited = B_TRUE;
    344 
    345 	SET_BOOKMARK(&czb, objset,
    346 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
    347 	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
    348 
    349 	mutex_enter(&pd.pd_mtx);
    350 	pd.pd_cancel = B_TRUE;
    351 	cv_broadcast(&pd.pd_cv);
    352 	while (!pd.pd_exited)
    353 		cv_wait(&pd.pd_cv, &pd.pd_mtx);
    354 	mutex_exit(&pd.pd_mtx);
    355 
    356 	mutex_destroy(&pd.pd_mtx);
    357 	cv_destroy(&pd.pd_cv);
    358 
    359 	return (err);
    360 }
    361 
    362 /*
    363  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
    364  * in syncing context).
    365  */
    366 int
    367 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
    368     blkptr_cb_t func, void *arg)
    369 {
    370 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
    371 	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
    372 }
    373 
    374 /*
    375  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
    376  */
    377 int
    378 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    379     blkptr_cb_t func, void *arg)
    380 {
    381 	int err;
    382 	uint64_t obj;
    383 	dsl_pool_t *dp = spa_get_dsl(spa);
    384 	objset_t *mos = dp->dp_meta_objset;
    385 
    386 	/* visit the MOS */
    387 	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
    388 	    txg_start, flags, func, arg);
    389 	if (err)
    390 		return (err);
    391 
    392 	/* visit each dataset */
    393 	for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE,
    394 	    txg_start)) {
    395 		dmu_object_info_t doi;
    396 
    397 		err = dmu_object_info(mos, obj, &doi);
    398 		if (err)
    399 			return (err);
    400 
    401 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
    402 			dsl_dataset_t *ds;
    403 			uint64_t txg = txg_start;
    404 
    405 			rw_enter(&dp->dp_config_rwlock, RW_READER);
    406 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
    407 			rw_exit(&dp->dp_config_rwlock);
    408 			if (err)
    409 				return (err);
    410 			if (ds->ds_phys->ds_prev_snap_txg > txg)
    411 				txg = ds->ds_phys->ds_prev_snap_txg;
    412 			err = traverse_dataset(ds, txg, flags, func, arg);
    413 			dsl_dataset_rele(ds, FTAG);
    414 			if (err)
    415 				return (err);
    416 		}
    417 	}
    418 	if (err == ESRCH)
    419 		err = 0;
    420 	return (err);
    421 }
    422