Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/dmu.h>
     29 #include <sys/dmu_impl.h>
     30 #include <sys/dmu_tx.h>
     31 #include <sys/dbuf.h>
     32 #include <sys/dnode.h>
     33 #include <sys/zfs_context.h>
     34 #include <sys/dmu_objset.h>
     35 #include <sys/dmu_traverse.h>
     36 #include <sys/dsl_dataset.h>
     37 #include <sys/dsl_dir.h>
     38 #include <sys/dsl_pool.h>
     39 #include <sys/dsl_synctask.h>
     40 #include <sys/zfs_ioctl.h>
     41 #include <sys/zap.h>
     42 #include <sys/zio_checksum.h>
     43 
     44 static char *dmu_recv_tag = "dmu_recv_tag";
     45 
     46 struct backuparg {
     47 	dmu_replay_record_t *drr;
     48 	vnode_t *vp;
     49 	offset_t *off;
     50 	objset_t *os;
     51 	zio_cksum_t zc;
     52 	int err;
     53 };
     54 
     55 static int
     56 dump_bytes(struct backuparg *ba, void *buf, int len)
     57 {
     58 	ssize_t resid; /* have to get resid to get detailed errno */
     59 	ASSERT3U(len % 8, ==, 0);
     60 
     61 	fletcher_4_incremental_native(buf, len, &ba->zc);
     62 	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
     63 	    (caddr_t)buf, len,
     64 	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
     65 	*ba->off += len;
     66 	return (ba->err);
     67 }
     68 
     69 static int
     70 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
     71     uint64_t length)
     72 {
     73 	/* write a FREE record */
     74 	bzero(ba->drr, sizeof (dmu_replay_record_t));
     75 	ba->drr->drr_type = DRR_FREE;
     76 	ba->drr->drr_u.drr_free.drr_object = object;
     77 	ba->drr->drr_u.drr_free.drr_offset = offset;
     78 	ba->drr->drr_u.drr_free.drr_length = length;
     79 
     80 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
     81 		return (EINTR);
     82 	return (0);
     83 }
     84 
     85 static int
     86 dump_data(struct backuparg *ba, dmu_object_type_t type,
     87     uint64_t object, uint64_t offset, int blksz, void *data)
     88 {
     89 	/* write a DATA record */
     90 	bzero(ba->drr, sizeof (dmu_replay_record_t));
     91 	ba->drr->drr_type = DRR_WRITE;
     92 	ba->drr->drr_u.drr_write.drr_object = object;
     93 	ba->drr->drr_u.drr_write.drr_type = type;
     94 	ba->drr->drr_u.drr_write.drr_offset = offset;
     95 	ba->drr->drr_u.drr_write.drr_length = blksz;
     96 
     97 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
     98 		return (EINTR);
     99 	if (dump_bytes(ba, data, blksz))
    100 		return (EINTR);
    101 	return (0);
    102 }
    103 
    104 static int
    105 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
    106 {
    107 	/* write a FREEOBJECTS record */
    108 	bzero(ba->drr, sizeof (dmu_replay_record_t));
    109 	ba->drr->drr_type = DRR_FREEOBJECTS;
    110 	ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
    111 	ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
    112 
    113 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
    114 		return (EINTR);
    115 	return (0);
    116 }
    117 
    118 static int
    119 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
    120 {
    121 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
    122 		return (dump_freeobjects(ba, object, 1));
    123 
    124 	/* write an OBJECT record */
    125 	bzero(ba->drr, sizeof (dmu_replay_record_t));
    126 	ba->drr->drr_type = DRR_OBJECT;
    127 	ba->drr->drr_u.drr_object.drr_object = object;
    128 	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
    129 	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
    130 	ba->drr->drr_u.drr_object.drr_blksz =
    131 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
    132 	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
    133 	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
    134 	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
    135 
    136 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
    137 		return (EINTR);
    138 
    139 	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
    140 		return (EINTR);
    141 
    142 	/* free anything past the end of the file */
    143 	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
    144 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
    145 		return (EINTR);
    146 	if (ba->err)
    147 		return (EINTR);
    148 	return (0);
    149 }
    150 
    151 #define	BP_SPAN(dnp, level) \
    152 	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
    153 	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
    154 
    155 static int
    156 backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
    157 {
    158 	struct backuparg *ba = arg;
    159 	uint64_t object = bc->bc_bookmark.zb_object;
    160 	int level = bc->bc_bookmark.zb_level;
    161 	uint64_t blkid = bc->bc_bookmark.zb_blkid;
    162 	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
    163 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
    164 	void *data = bc->bc_data;
    165 	int err = 0;
    166 
    167 	if (issig(JUSTLOOKING) && issig(FORREAL))
    168 		return (EINTR);
    169 
    170 	ASSERT(data || bp == NULL);
    171 
    172 	if (bp == NULL && object == 0) {
    173 		uint64_t span = BP_SPAN(bc->bc_dnode, level);
    174 		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
    175 		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
    176 	} else if (bp == NULL) {
    177 		uint64_t span = BP_SPAN(bc->bc_dnode, level);
    178 		err = dump_free(ba, object, blkid * span, span);
    179 	} else if (data && level == 0 && type == DMU_OT_DNODE) {
    180 		dnode_phys_t *blk = data;
    181 		int i;
    182 		int blksz = BP_GET_LSIZE(bp);
    183 
    184 		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
    185 			uint64_t dnobj =
    186 			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
    187 			err = dump_dnode(ba, dnobj, blk+i);
    188 			if (err)
    189 				break;
    190 		}
    191 	} else if (level == 0 &&
    192 	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
    193 		int blksz = BP_GET_LSIZE(bp);
    194 		if (data == NULL) {
    195 			uint32_t aflags = ARC_WAIT;
    196 			arc_buf_t *abuf;
    197 			zbookmark_t zb;
    198 
    199 			zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
    200 			zb.zb_object = object;
    201 			zb.zb_level = level;
    202 			zb.zb_blkid = blkid;
    203 			(void) arc_read_nolock(NULL, spa, bp,
    204 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
    205 			    ZIO_FLAG_MUSTSUCCEED, &aflags, &zb);
    206 
    207 			if (abuf) {
    208 				err = dump_data(ba, type, object, blkid * blksz,
    209 				    blksz, abuf->b_data);
    210 				(void) arc_buf_remove_ref(abuf, &abuf);
    211 			}
    212 		} else {
    213 			err = dump_data(ba, type, object, blkid * blksz,
    214 			    blksz, data);
    215 		}
    216 	}
    217 
    218 	ASSERT(err == 0 || err == EINTR);
    219 	return (err);
    220 }
    221 
    222 int
    223 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    224     vnode_t *vp, offset_t *off)
    225 {
    226 	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
    227 	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
    228 	dmu_replay_record_t *drr;
    229 	struct backuparg ba;
    230 	int err;
    231 	uint64_t fromtxg = 0;
    232 
    233 	/* tosnap must be a snapshot */
    234 	if (ds->ds_phys->ds_next_snap_obj == 0)
    235 		return (EINVAL);
    236 
    237 	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
    238 	if (fromds && (ds->ds_dir != fromds->ds_dir ||
    239 	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
    240 		return (EXDEV);
    241 
    242 	if (fromorigin) {
    243 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
    244 
    245 		if (fromsnap)
    246 			return (EINVAL);
    247 
    248 		if (dsl_dir_is_clone(ds->ds_dir)) {
    249 			rw_enter(&dp->dp_config_rwlock, RW_READER);
    250 			err = dsl_dataset_hold_obj(dp,
    251 			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
    252 			rw_exit(&dp->dp_config_rwlock);
    253 			if (err)
    254 				return (err);
    255 		} else {
    256 			fromorigin = B_FALSE;
    257 		}
    258 	}
    259 
    260 
    261 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
    262 	drr->drr_type = DRR_BEGIN;
    263 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
    264 	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
    265 	drr->drr_u.drr_begin.drr_creation_time =
    266 	    ds->ds_phys->ds_creation_time;
    267 	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
    268 	if (fromorigin)
    269 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
    270 	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
    271 	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
    272 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
    273 
    274 	if (fromds)
    275 		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
    276 	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
    277 
    278 	if (fromds)
    279 		fromtxg = fromds->ds_phys->ds_creation_txg;
    280 	if (fromorigin)
    281 		dsl_dataset_rele(fromds, FTAG);
    282 
    283 	ba.drr = drr;
    284 	ba.vp = vp;
    285 	ba.os = tosnap;
    286 	ba.off = off;
    287 	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
    288 
    289 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
    290 		kmem_free(drr, sizeof (dmu_replay_record_t));
    291 		return (ba.err);
    292 	}
    293 
    294 	err = traverse_dsl_dataset(ds, fromtxg,
    295 	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
    296 	    backup_cb, &ba);
    297 
    298 	if (err) {
    299 		if (err == EINTR && ba.err)
    300 			err = ba.err;
    301 		kmem_free(drr, sizeof (dmu_replay_record_t));
    302 		return (err);
    303 	}
    304 
    305 	bzero(drr, sizeof (dmu_replay_record_t));
    306 	drr->drr_type = DRR_END;
    307 	drr->drr_u.drr_end.drr_checksum = ba.zc;
    308 
    309 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
    310 		kmem_free(drr, sizeof (dmu_replay_record_t));
    311 		return (ba.err);
    312 	}
    313 
    314 	kmem_free(drr, sizeof (dmu_replay_record_t));
    315 
    316 	return (0);
    317 }
    318 
    319 struct recvbeginsyncarg {
    320 	const char *tofs;
    321 	const char *tosnap;
    322 	dsl_dataset_t *origin;
    323 	uint64_t fromguid;
    324 	dmu_objset_type_t type;
    325 	void *tag;
    326 	boolean_t force;
    327 	uint64_t dsflags;
    328 	char clonelastname[MAXNAMELEN];
    329 	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
    330 };
    331 
    332 static dsl_dataset_t *
    333 recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
    334     cred_t *cr, dmu_tx_t *tx)
    335 {
    336 	dsl_dataset_t *ds;
    337 
    338 	/* This should always work, since we just created it */
    339 	/* XXX - create should return an owned ds */
    340 	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
    341 	    DS_MODE_INCONSISTENT, dmu_recv_tag, &ds));
    342 
    343 	if (type != DMU_OST_NONE) {
    344 		(void) dmu_objset_create_impl(dp->dp_spa,
    345 		    ds, &ds->ds_phys->ds_bp, type, tx);
    346 	}
    347 
    348 	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
    349 	    dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
    350 
    351 	return (ds);
    352 }
    353 
    354 /* ARGSUSED */
    355 static int
    356 recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
    357 {
    358 	dsl_dir_t *dd = arg1;
    359 	struct recvbeginsyncarg *rbsa = arg2;
    360 	objset_t *mos = dd->dd_pool->dp_meta_objset;
    361 	uint64_t val;
    362 	int err;
    363 
    364 	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
    365 	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
    366 
    367 	if (err != ENOENT)
    368 		return (err ? err : EEXIST);
    369 
    370 	if (rbsa->origin) {
    371 		/* make sure it's a snap in the same pool */
    372 		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
    373 			return (EXDEV);
    374 		if (rbsa->origin->ds_phys->ds_num_children == 0)
    375 			return (EINVAL);
    376 		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
    377 			return (ENODEV);
    378 	}
    379 
    380 	return (0);
    381 }
    382 
    383 static void
    384 recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    385 {
    386 	dsl_dir_t *dd = arg1;
    387 	struct recvbeginsyncarg *rbsa = arg2;
    388 	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
    389 	uint64_t dsobj;
    390 
    391 	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
    392 	    rbsa->origin, flags, cr, tx);
    393 
    394 	rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
    395 	    rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
    396 }
    397 
    398 static int
    399 recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
    400 {
    401 	dsl_dataset_t *ds = arg1;
    402 	struct recvbeginsyncarg *rbsa = arg2;
    403 	int err;
    404 
    405 	/* must be a head ds */
    406 	if (ds->ds_phys->ds_next_snap_obj != 0)
    407 		return (EINVAL);
    408 
    409 	/* must not be a clone ds */
    410 	if (dsl_dir_is_clone(ds->ds_dir))
    411 		return (EINVAL);
    412 
    413 	err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
    414 	if (err)
    415 		return (err);
    416 
    417 	if (rbsa->origin) {
    418 		/* make sure it's a snap in the same pool */
    419 		if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
    420 			return (EXDEV);
    421 		if (rbsa->origin->ds_phys->ds_num_children == 0)
    422 			return (EINVAL);
    423 		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
    424 			return (ENODEV);
    425 	}
    426 
    427 	return (0);
    428 }
    429 
    430 static void
    431 recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    432 {
    433 	dsl_dataset_t *ds = arg1;
    434 	struct recvbeginsyncarg *rbsa = arg2;
    435 	dsl_dir_t *dd = ds->ds_dir;
    436 	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
    437 	uint64_t dsobj;
    438 
    439 	/*
    440 	 * NB: caller must provide an extra hold on the dsl_dir_t, so it
    441 	 * won't go away when dsl_dataset_destroy_sync() closes the
    442 	 * dataset.
    443 	 */
    444 	dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
    445 
    446 	dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
    447 
    448 	rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
    449 	    rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
    450 }
    451 
    452 /* ARGSUSED */
    453 static int
    454 recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
    455 {
    456 	dsl_dataset_t *ds = arg1;
    457 	struct recvbeginsyncarg *rbsa = arg2;
    458 	int err;
    459 	uint64_t val;
    460 
    461 	/* must not have any changes since most recent snapshot */
    462 	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
    463 		return (ETXTBSY);
    464 
    465 	/* must already be a snapshot of this fs */
    466 	if (ds->ds_phys->ds_prev_snap_obj == 0)
    467 		return (ENODEV);
    468 
    469 	/* most recent snapshot must match fromguid */
    470 	if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
    471 		return (ENODEV);
    472 
    473 	/* temporary clone name must not exist */
    474 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
    475 	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
    476 	    rbsa->clonelastname, 8, 1, &val);
    477 	if (err == 0)
    478 		return (EEXIST);
    479 	if (err != ENOENT)
    480 		return (err);
    481 
    482 	/* new snapshot name must not exist */
    483 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
    484 	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
    485 	if (err == 0)
    486 		return (EEXIST);
    487 	if (err != ENOENT)
    488 		return (err);
    489 	return (0);
    490 }
    491 
    492 /* ARGSUSED */
    493 static void
    494 recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    495 {
    496 	dsl_dataset_t *ohds = arg1;
    497 	struct recvbeginsyncarg *rbsa = arg2;
    498 	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
    499 	dsl_dataset_t *ods, *cds;
    500 	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
    501 	uint64_t dsobj;
    502 
    503 	/* create the temporary clone */
    504 	VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
    505 	    FTAG, &ods));
    506 	dsobj = dsl_dataset_create_sync(ohds->ds_dir,
    507 	    rbsa->clonelastname, ods, flags, cr, tx);
    508 	dsl_dataset_rele(ods, FTAG);
    509 
    510 	/* open the temporary clone */
    511 	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
    512 	    DS_MODE_INCONSISTENT, dmu_recv_tag, &cds));
    513 
    514 	/* copy the refquota from the target fs to the clone */
    515 	if (ohds->ds_quota > 0)
    516 		dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
    517 
    518 	rbsa->ds = cds;
    519 
    520 	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
    521 	    dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
    522 }
    523 
    524 /* ARGSUSED */
    525 static void
    526 recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    527 {
    528 	dsl_dataset_t *ds = arg1;
    529 
    530 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
    531 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
    532 
    533 	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
    534 	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
    535 	    ds->ds_object);
    536 }
    537 
    538 /*
    539  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
    540  * succeeds; otherwise we will leak the holds on the datasets.
    541  */
    542 int
    543 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    544     boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
    545 {
    546 	int err = 0;
    547 	boolean_t byteswap;
    548 	struct recvbeginsyncarg rbsa;
    549 	uint64_t version;
    550 	int flags;
    551 	dsl_dataset_t *ds;
    552 
    553 	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
    554 		byteswap = FALSE;
    555 	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
    556 		byteswap = TRUE;
    557 	else
    558 		return (EINVAL);
    559 
    560 	rbsa.tofs = tofs;
    561 	rbsa.tosnap = tosnap;
    562 	rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
    563 	rbsa.fromguid = drrb->drr_fromguid;
    564 	rbsa.type = drrb->drr_type;
    565 	rbsa.tag = FTAG;
    566 	rbsa.dsflags = 0;
    567 	version = drrb->drr_version;
    568 	flags = drrb->drr_flags;
    569 
    570 	if (byteswap) {
    571 		rbsa.type = BSWAP_32(rbsa.type);
    572 		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
    573 		version = BSWAP_64(version);
    574 		flags = BSWAP_32(flags);
    575 	}
    576 
    577 	if (version != DMU_BACKUP_STREAM_VERSION ||
    578 	    rbsa.type >= DMU_OST_NUMTYPES ||
    579 	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
    580 		return (EINVAL);
    581 
    582 	if (flags & DRR_FLAG_CI_DATA)
    583 		rbsa.dsflags = DS_FLAG_CI_DATASET;
    584 
    585 	bzero(drc, sizeof (dmu_recv_cookie_t));
    586 	drc->drc_drrb = drrb;
    587 	drc->drc_tosnap = tosnap;
    588 	drc->drc_force = force;
    589 
    590 	/*
    591 	 * Process the begin in syncing context.
    592 	 */
    593 	if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
    594 		/* offline incremental receive */
    595 		err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds);
    596 		if (err)
    597 			return (err);
    598 
    599 		/*
    600 		 * Only do the rollback if the most recent snapshot
    601 		 * matches the incremental source
    602 		 */
    603 		if (force) {
    604 			if (ds->ds_prev == NULL ||
    605 			    ds->ds_prev->ds_phys->ds_guid !=
    606 			    rbsa.fromguid) {
    607 				dsl_dataset_disown(ds, dmu_recv_tag);
    608 				return (ENODEV);
    609 			}
    610 			(void) dsl_dataset_rollback(ds, DMU_OST_NONE);
    611 		}
    612 		rbsa.force = B_FALSE;
    613 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
    614 		    recv_incremental_check,
    615 		    recv_offline_incremental_sync, ds, &rbsa, 1);
    616 		if (err) {
    617 			dsl_dataset_disown(ds, dmu_recv_tag);
    618 			return (err);
    619 		}
    620 		drc->drc_logical_ds = drc->drc_real_ds = ds;
    621 	} else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
    622 		/* online incremental receive */
    623 
    624 		/* tmp clone name is: tofs/%tosnap" */
    625 		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
    626 		    "%%%s", tosnap);
    627 
    628 		/* open the dataset we are logically receiving into */
    629 		err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
    630 		if (err)
    631 			return (err);
    632 
    633 		rbsa.force = force;
    634 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
    635 		    recv_incremental_check,
    636 		    recv_online_incremental_sync, ds, &rbsa, 5);
    637 		if (err) {
    638 			dsl_dataset_rele(ds, dmu_recv_tag);
    639 			return (err);
    640 		}
    641 		drc->drc_logical_ds = ds;
    642 		drc->drc_real_ds = rbsa.ds;
    643 	} else {
    644 		/* create new fs -- full backup or clone */
    645 		dsl_dir_t *dd = NULL;
    646 		const char *tail;
    647 
    648 		err = dsl_dir_open(tofs, FTAG, &dd, &tail);
    649 		if (err)
    650 			return (err);
    651 		if (tail == NULL) {
    652 			if (!force) {
    653 				dsl_dir_close(dd, FTAG);
    654 				return (EEXIST);
    655 			}
    656 
    657 			rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
    658 			err = dsl_dataset_own_obj(dd->dd_pool,
    659 			    dd->dd_phys->dd_head_dataset_obj,
    660 			    DS_MODE_INCONSISTENT, FTAG, &ds);
    661 			rw_exit(&dd->dd_pool->dp_config_rwlock);
    662 			if (err) {
    663 				dsl_dir_close(dd, FTAG);
    664 				return (err);
    665 			}
    666 
    667 			dsl_dataset_make_exclusive(ds, FTAG);
    668 			err = dsl_sync_task_do(dd->dd_pool,
    669 			    recv_full_existing_check,
    670 			    recv_full_existing_sync, ds, &rbsa, 5);
    671 			dsl_dataset_disown(ds, FTAG);
    672 		} else {
    673 			err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
    674 			    recv_full_sync, dd, &rbsa, 5);
    675 		}
    676 		dsl_dir_close(dd, FTAG);
    677 		if (err)
    678 			return (err);
    679 		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
    680 		drc->drc_newfs = B_TRUE;
    681 	}
    682 
    683 	return (0);
    684 }
    685 
    686 struct restorearg {
    687 	int err;
    688 	int byteswap;
    689 	vnode_t *vp;
    690 	char *buf;
    691 	uint64_t voff;
    692 	int bufsize; /* amount of memory allocated for buf */
    693 	zio_cksum_t cksum;
    694 };
    695 
    696 static void *
    697 restore_read(struct restorearg *ra, int len)
    698 {
    699 	void *rv;
    700 	int done = 0;
    701 
    702 	/* some things will require 8-byte alignment, so everything must */
    703 	ASSERT3U(len % 8, ==, 0);
    704 
    705 	while (done < len) {
    706 		ssize_t resid;
    707 
    708 		ra->err = vn_rdwr(UIO_READ, ra->vp,
    709 		    (caddr_t)ra->buf + done, len - done,
    710 		    ra->voff, UIO_SYSSPACE, FAPPEND,
    711 		    RLIM64_INFINITY, CRED(), &resid);
    712 
    713 		if (resid == len - done)
    714 			ra->err = EINVAL;
    715 		ra->voff += len - done - resid;
    716 		done = len - resid;
    717 		if (ra->err)
    718 			return (NULL);
    719 	}
    720 
    721 	ASSERT3U(done, ==, len);
    722 	rv = ra->buf;
    723 	if (ra->byteswap)
    724 		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
    725 	else
    726 		fletcher_4_incremental_native(rv, len, &ra->cksum);
    727 	return (rv);
    728 }
    729 
    730 static void
    731 backup_byteswap(dmu_replay_record_t *drr)
    732 {
    733 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
    734 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
    735 	drr->drr_type = BSWAP_32(drr->drr_type);
    736 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
    737 	switch (drr->drr_type) {
    738 	case DRR_BEGIN:
    739 		DO64(drr_begin.drr_magic);
    740 		DO64(drr_begin.drr_version);
    741 		DO64(drr_begin.drr_creation_time);
    742 		DO32(drr_begin.drr_type);
    743 		DO32(drr_begin.drr_flags);
    744 		DO64(drr_begin.drr_toguid);
    745 		DO64(drr_begin.drr_fromguid);
    746 		break;
    747 	case DRR_OBJECT:
    748 		DO64(drr_object.drr_object);
    749 		/* DO64(drr_object.drr_allocation_txg); */
    750 		DO32(drr_object.drr_type);
    751 		DO32(drr_object.drr_bonustype);
    752 		DO32(drr_object.drr_blksz);
    753 		DO32(drr_object.drr_bonuslen);
    754 		break;
    755 	case DRR_FREEOBJECTS:
    756 		DO64(drr_freeobjects.drr_firstobj);
    757 		DO64(drr_freeobjects.drr_numobjs);
    758 		break;
    759 	case DRR_WRITE:
    760 		DO64(drr_write.drr_object);
    761 		DO32(drr_write.drr_type);
    762 		DO64(drr_write.drr_offset);
    763 		DO64(drr_write.drr_length);
    764 		break;
    765 	case DRR_FREE:
    766 		DO64(drr_free.drr_object);
    767 		DO64(drr_free.drr_offset);
    768 		DO64(drr_free.drr_length);
    769 		break;
    770 	case DRR_END:
    771 		DO64(drr_end.drr_checksum.zc_word[0]);
    772 		DO64(drr_end.drr_checksum.zc_word[1]);
    773 		DO64(drr_end.drr_checksum.zc_word[2]);
    774 		DO64(drr_end.drr_checksum.zc_word[3]);
    775 		break;
    776 	}
    777 #undef DO64
    778 #undef DO32
    779 }
    780 
    781 static int
    782 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
    783 {
    784 	int err;
    785 	dmu_tx_t *tx;
    786 
    787 	err = dmu_object_info(os, drro->drr_object, NULL);
    788 
    789 	if (err != 0 && err != ENOENT)
    790 		return (EINVAL);
    791 
    792 	if (drro->drr_type == DMU_OT_NONE ||
    793 	    drro->drr_type >= DMU_OT_NUMTYPES ||
    794 	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
    795 	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
    796 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
    797 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
    798 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
    799 	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
    800 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
    801 		return (EINVAL);
    802 	}
    803 
    804 	tx = dmu_tx_create(os);
    805 
    806 	if (err == ENOENT) {
    807 		/* currently free, want to be allocated */
    808 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
    809 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
    810 		err = dmu_tx_assign(tx, TXG_WAIT);
    811 		if (err) {
    812 			dmu_tx_abort(tx);
    813 			return (err);
    814 		}
    815 		err = dmu_object_claim(os, drro->drr_object,
    816 		    drro->drr_type, drro->drr_blksz,
    817 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
    818 	} else {
    819 		/* currently allocated, want to be allocated */
    820 		dmu_tx_hold_bonus(tx, drro->drr_object);
    821 		/*
    822 		 * We may change blocksize, so need to
    823 		 * hold_write
    824 		 */
    825 		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
    826 		err = dmu_tx_assign(tx, TXG_WAIT);
    827 		if (err) {
    828 			dmu_tx_abort(tx);
    829 			return (err);
    830 		}
    831 
    832 		err = dmu_object_reclaim(os, drro->drr_object,
    833 		    drro->drr_type, drro->drr_blksz,
    834 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
    835 	}
    836 	if (err) {
    837 		dmu_tx_commit(tx);
    838 		return (EINVAL);
    839 	}
    840 
    841 	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
    842 	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
    843 
    844 	if (drro->drr_bonuslen) {
    845 		dmu_buf_t *db;
    846 		void *data;
    847 		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
    848 		dmu_buf_will_dirty(db, tx);
    849 
    850 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
    851 		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
    852 		if (data == NULL) {
    853 			dmu_tx_commit(tx);
    854 			return (ra->err);
    855 		}
    856 		bcopy(data, db->db_data, drro->drr_bonuslen);
    857 		if (ra->byteswap) {
    858 			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
    859 			    drro->drr_bonuslen);
    860 		}
    861 		dmu_buf_rele(db, FTAG);
    862 	}
    863 	dmu_tx_commit(tx);
    864 	return (0);
    865 }
    866 
    867 /* ARGSUSED */
    868 static int
    869 restore_freeobjects(struct restorearg *ra, objset_t *os,
    870     struct drr_freeobjects *drrfo)
    871 {
    872 	uint64_t obj;
    873 
    874 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
    875 		return (EINVAL);
    876 
    877 	for (obj = drrfo->