Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/dmu.h>
     27 #include <sys/dmu_impl.h>
     28 #include <sys/dmu_tx.h>
     29 #include <sys/dbuf.h>
     30 #include <sys/dnode.h>
     31 #include <sys/zfs_context.h>
     32 #include <sys/dmu_objset.h>
     33 #include <sys/dmu_traverse.h>
     34 #include <sys/dsl_dataset.h>
     35 #include <sys/dsl_dir.h>
     36 #include <sys/dsl_pool.h>
     37 #include <sys/dsl_synctask.h>
     38 #include <sys/zfs_ioctl.h>
     39 #include <sys/zap.h>
     40 #include <sys/zio_checksum.h>
     41 
/*
 * Tag passed to dsl_dataset_hold(), dsl_dataset_own_obj(),
 * dsl_dataset_rele() and dsl_dataset_destroy() for the datasets that a
 * receive operation keeps referenced across dmu_recv_begin() and
 * dmu_recv_stream().
 */
static char *dmu_recv_tag = "dmu_recv_tag";
     43 
/*
 * State shared by dmu_sendbackup(), the traversal callback backup_cb()
 * and the dump_*() record writers while a send stream is generated.
 */
struct backuparg {
	dmu_replay_record_t *drr;	/* scratch record, rebuilt per write */
	vnode_t *vp;			/* vnode the stream is written to */
	offset_t *off;			/* caller's offset, bumped per write */
	objset_t *os;			/* objset being sent (tosnap) */
	zio_cksum_t zc;			/* running fletcher-4 of the stream */
	int err;			/* result of the last vn_rdwr() */
};
     52 
     53 static int
     54 dump_bytes(struct backuparg *ba, void *buf, int len)
     55 {
     56 	ssize_t resid; /* have to get resid to get detailed errno */
     57 	ASSERT3U(len % 8, ==, 0);
     58 
     59 	fletcher_4_incremental_native(buf, len, &ba->zc);
     60 	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
     61 	    (caddr_t)buf, len,
     62 	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
     63 	*ba->off += len;
     64 	return (ba->err);
     65 }
     66 
     67 static int
     68 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
     69     uint64_t length)
     70 {
     71 	/* write a FREE record */
     72 	bzero(ba->drr, sizeof (dmu_replay_record_t));
     73 	ba->drr->drr_type = DRR_FREE;
     74 	ba->drr->drr_u.drr_free.drr_object = object;
     75 	ba->drr->drr_u.drr_free.drr_offset = offset;
     76 	ba->drr->drr_u.drr_free.drr_length = length;
     77 
     78 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
     79 		return (EINTR);
     80 	return (0);
     81 }
     82 
     83 static int
     84 dump_data(struct backuparg *ba, dmu_object_type_t type,
     85     uint64_t object, uint64_t offset, int blksz, void *data)
     86 {
     87 	/* write a DATA record */
     88 	bzero(ba->drr, sizeof (dmu_replay_record_t));
     89 	ba->drr->drr_type = DRR_WRITE;
     90 	ba->drr->drr_u.drr_write.drr_object = object;
     91 	ba->drr->drr_u.drr_write.drr_type = type;
     92 	ba->drr->drr_u.drr_write.drr_offset = offset;
     93 	ba->drr->drr_u.drr_write.drr_length = blksz;
     94 
     95 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
     96 		return (EINTR);
     97 	if (dump_bytes(ba, data, blksz))
     98 		return (EINTR);
     99 	return (0);
    100 }
    101 
    102 static int
    103 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
    104 {
    105 	/* write a FREEOBJECTS record */
    106 	bzero(ba->drr, sizeof (dmu_replay_record_t));
    107 	ba->drr->drr_type = DRR_FREEOBJECTS;
    108 	ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
    109 	ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
    110 
    111 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
    112 		return (EINTR);
    113 	return (0);
    114 }
    115 
    116 static int
    117 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
    118 {
    119 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
    120 		return (dump_freeobjects(ba, object, 1));
    121 
    122 	/* write an OBJECT record */
    123 	bzero(ba->drr, sizeof (dmu_replay_record_t));
    124 	ba->drr->drr_type = DRR_OBJECT;
    125 	ba->drr->drr_u.drr_object.drr_object = object;
    126 	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
    127 	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
    128 	ba->drr->drr_u.drr_object.drr_blksz =
    129 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
    130 	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
    131 	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
    132 	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
    133 
    134 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
    135 		return (EINTR);
    136 
    137 	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
    138 		return (EINTR);
    139 
    140 	/* free anything past the end of the file */
    141 	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
    142 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
    143 		return (EINTR);
    144 	if (ba->err)
    145 		return (EINTR);
    146 	return (0);
    147 }
    148 
    149 #define	BP_SPAN(dnp, level) \
    150 	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
    151 	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
    152 
    153 /* ARGSUSED */
    154 static int
    155 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    156     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
    157 {
    158 	struct backuparg *ba = arg;
    159 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
    160 	int err = 0;
    161 
    162 	if (issig(JUSTLOOKING) && issig(FORREAL))
    163 		return (EINTR);
    164 
    165 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
    166 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
    167 		return (0);
    168 	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
    169 		uint64_t span = BP_SPAN(dnp, zb->zb_level);
    170 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
    171 		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
    172 	} else if (bp == NULL) {
    173 		uint64_t span = BP_SPAN(dnp, zb->zb_level);
    174 		err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
    175 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
    176 		return (0);
    177 	} else if (type == DMU_OT_DNODE) {
    178 		dnode_phys_t *blk;
    179 		int i;
    180 		int blksz = BP_GET_LSIZE(bp);
    181 		uint32_t aflags = ARC_WAIT;
    182 		arc_buf_t *abuf;
    183 
    184 		if (arc_read_nolock(NULL, spa, bp,
    185 		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
    186 		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
    187 			return (EIO);
    188 
    189 		blk = abuf->b_data;
    190 		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
    191 			uint64_t dnobj = (zb->zb_blkid <<
    192 			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
    193 			err = dump_dnode(ba, dnobj, blk+i);
    194 			if (err)
    195 				break;
    196 		}
    197 		(void) arc_buf_remove_ref(abuf, &abuf);
    198 	} else { /* it's a level-0 block of a regular object */
    199 		uint32_t aflags = ARC_WAIT;
    200 		arc_buf_t *abuf;
    201 		int blksz = BP_GET_LSIZE(bp);
    202 
    203 		if (arc_read_nolock(NULL, spa, bp,
    204 		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
    205 		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
    206 			return (EIO);
    207 
    208 		err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
    209 		    blksz, abuf->b_data);
    210 		(void) arc_buf_remove_ref(abuf, &abuf);
    211 	}
    212 
    213 	ASSERT(err == 0 || err == EINTR);
    214 	return (err);
    215 }
    216 
    217 int
    218 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    219     vnode_t *vp, offset_t *off)
    220 {
    221 	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
    222 	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
    223 	dmu_replay_record_t *drr;
    224 	struct backuparg ba;
    225 	int err;
    226 	uint64_t fromtxg = 0;
    227 
    228 	/* tosnap must be a snapshot */
    229 	if (ds->ds_phys->ds_next_snap_obj == 0)
    230 		return (EINVAL);
    231 
    232 	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
    233 	if (fromds && (ds->ds_dir != fromds->ds_dir ||
    234 	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
    235 		return (EXDEV);
    236 
    237 	if (fromorigin) {
    238 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
    239 
    240 		if (fromsnap)
    241 			return (EINVAL);
    242 
    243 		if (dsl_dir_is_clone(ds->ds_dir)) {
    244 			rw_enter(&dp->dp_config_rwlock, RW_READER);
    245 			err = dsl_dataset_hold_obj(dp,
    246 			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
    247 			rw_exit(&dp->dp_config_rwlock);
    248 			if (err)
    249 				return (err);
    250 		} else {
    251 			fromorigin = B_FALSE;
    252 		}
    253 	}
    254 
    255 
    256 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
    257 	drr->drr_type = DRR_BEGIN;
    258 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
    259 	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
    260 	drr->drr_u.drr_begin.drr_creation_time =
    261 	    ds->ds_phys->ds_creation_time;
    262 	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
    263 	if (fromorigin)
    264 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
    265 	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
    266 	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
    267 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
    268 
    269 	if (fromds)
    270 		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
    271 	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
    272 
    273 	if (fromds)
    274 		fromtxg = fromds->ds_phys->ds_creation_txg;
    275 	if (fromorigin)
    276 		dsl_dataset_rele(fromds, FTAG);
    277 
    278 	ba.drr = drr;
    279 	ba.vp = vp;
    280 	ba.os = tosnap;
    281 	ba.off = off;
    282 	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
    283 
    284 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
    285 		kmem_free(drr, sizeof (dmu_replay_record_t));
    286 		return (ba.err);
    287 	}
    288 
    289 	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
    290 	    backup_cb, &ba);
    291 
    292 	if (err) {
    293 		if (err == EINTR && ba.err)
    294 			err = ba.err;
    295 		kmem_free(drr, sizeof (dmu_replay_record_t));
    296 		return (err);
    297 	}
    298 
    299 	bzero(drr, sizeof (dmu_replay_record_t));
    300 	drr->drr_type = DRR_END;
    301 	drr->drr_u.drr_end.drr_checksum = ba.zc;
    302 
    303 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
    304 		kmem_free(drr, sizeof (dmu_replay_record_t));
    305 		return (ba.err);
    306 	}
    307 
    308 	kmem_free(drr, sizeof (dmu_replay_record_t));
    309 
    310 	return (0);
    311 }
    312 
/*
 * Arguments handed from dmu_recv_begin() to the recv_new_* and
 * recv_existing_* sync-task check/sync functions.
 */
struct recvbeginsyncarg {
	const char *tofs;	/* full name of the target file system */
	const char *tosnap;	/* name of the snapshot being received */
	dsl_dataset_t *origin;	/* origin snapshot for a clone, or NULL */
	uint64_t fromguid;	/* drr_fromguid from the stream's BEGIN */
	dmu_objset_type_t type;	/* objset type from the stream's BEGIN */
	void *tag;		/* set to FTAG by dmu_recv_begin() */
	boolean_t force;	/* skip the modified-since-snapshot check */
	uint64_t dsflags;	/* extra DS_FLAG_* bits for the new dataset */
	char clonelastname[MAXNAMELEN];	/* temp clone name: "%<tosnap>" */
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
};
    325 
    326 /* ARGSUSED */
    327 static int
    328 recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
    329 {
    330 	dsl_dir_t *dd = arg1;
    331 	struct recvbeginsyncarg *rbsa = arg2;
    332 	objset_t *mos = dd->dd_pool->dp_meta_objset;
    333 	uint64_t val;
    334 	int err;
    335 
    336 	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
    337 	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
    338 
    339 	if (err != ENOENT)
    340 		return (err ? err : EEXIST);
    341 
    342 	if (rbsa->origin) {
    343 		/* make sure it's a snap in the same pool */
    344 		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
    345 			return (EXDEV);
    346 		if (!dsl_dataset_is_snapshot(rbsa->origin))
    347 			return (EINVAL);
    348 		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
    349 			return (ENODEV);
    350 	}
    351 
    352 	return (0);
    353 }
    354 
    355 static void
    356 recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    357 {
    358 	dsl_dir_t *dd = arg1;
    359 	struct recvbeginsyncarg *rbsa = arg2;
    360 	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
    361 	uint64_t dsobj;
    362 
    363 	/* Create and open new dataset. */
    364 	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
    365 	    rbsa->origin, flags, cr, tx);
    366 	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
    367 	    B_TRUE, dmu_recv_tag, &rbsa->ds));
    368 
    369 	if (rbsa->origin == NULL) {
    370 		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
    371 		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
    372 	}
    373 
    374 	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
    375 	    dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj);
    376 }
    377 
    378 /* ARGSUSED */
    379 static int
    380 recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
    381 {
    382 	dsl_dataset_t *ds = arg1;
    383 	struct recvbeginsyncarg *rbsa = arg2;
    384 	int err;
    385 	uint64_t val;
    386 
    387 	/* must not have any changes since most recent snapshot */
    388 	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
    389 		return (ETXTBSY);
    390 
    391 	if (rbsa->fromguid) {
    392 		/* if incremental, most recent snapshot must match fromguid */
    393 		if (ds->ds_prev == NULL)
    394 			return (ENODEV);
    395 		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
    396 			return (ENODEV);
    397 	} else {
    398 		/* if full, most recent snapshot must be $ORIGIN */
    399 		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
    400 			return (ENODEV);
    401 	}
    402 
    403 	/* temporary clone name must not exist */
    404 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
    405 	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
    406 	    rbsa->clonelastname, 8, 1, &val);
    407 	if (err == 0)
    408 		return (EEXIST);
    409 	if (err != ENOENT)
    410 		return (err);
    411 
    412 	/* new snapshot name must not exist */
    413 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
    414 	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
    415 	if (err == 0)
    416 		return (EEXIST);
    417 	if (err != ENOENT)
    418 		return (err);
    419 	return (0);
    420 }
    421 
    422 /* ARGSUSED */
    423 static void
    424 recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    425 {
    426 	dsl_dataset_t *ohds = arg1;
    427 	struct recvbeginsyncarg *rbsa = arg2;
    428 	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
    429 	dsl_dataset_t *cds;
    430 	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
    431 	uint64_t dsobj;
    432 
    433 	/* create and open the temporary clone */
    434 	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
    435 	    ohds->ds_prev, flags, cr, tx);
    436 	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
    437 
    438 	/*
    439 	 * If we actually created a non-clone, we need to create the
    440 	 * objset in our new dataset.
    441 	 */
    442 	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
    443 		(void) dmu_objset_create_impl(dp->dp_spa,
    444 		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
    445 	}
    446 
    447 	/* copy the refquota from the target fs to the clone */
    448 	if (ohds->ds_quota > 0)
    449 		dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
    450 
    451 	rbsa->ds = cds;
    452 
    453 	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
    454 	    dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
    455 }
    456 
    457 /*
    458  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
    459  * succeeds; otherwise we will leak the holds on the datasets.
    460  */
    461 int
    462 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    463     boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
    464 {
    465 	int err = 0;
    466 	boolean_t byteswap;
    467 	struct recvbeginsyncarg rbsa = { 0 };
    468 	uint64_t version;
    469 	int flags;
    470 	dsl_dataset_t *ds;
    471 
    472 	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
    473 		byteswap = FALSE;
    474 	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
    475 		byteswap = TRUE;
    476 	else
    477 		return (EINVAL);
    478 
    479 	rbsa.tofs = tofs;
    480 	rbsa.tosnap = tosnap;
    481 	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
    482 	rbsa.fromguid = drrb->drr_fromguid;
    483 	rbsa.type = drrb->drr_type;
    484 	rbsa.tag = FTAG;
    485 	rbsa.dsflags = 0;
    486 	version = drrb->drr_version;
    487 	flags = drrb->drr_flags;
    488 
    489 	if (byteswap) {
    490 		rbsa.type = BSWAP_32(rbsa.type);
    491 		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
    492 		version = BSWAP_64(version);
    493 		flags = BSWAP_32(flags);
    494 	}
    495 
    496 	if (version != DMU_BACKUP_STREAM_VERSION ||
    497 	    rbsa.type >= DMU_OST_NUMTYPES ||
    498 	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
    499 		return (EINVAL);
    500 
    501 	if (flags & DRR_FLAG_CI_DATA)
    502 		rbsa.dsflags = DS_FLAG_CI_DATASET;
    503 
    504 	bzero(drc, sizeof (dmu_recv_cookie_t));
    505 	drc->drc_drrb = drrb;
    506 	drc->drc_tosnap = tosnap;
    507 	drc->drc_force = force;
    508 
    509 	/*
    510 	 * Process the begin in syncing context.
    511 	 */
    512 
    513 	/* open the dataset we are logically receiving into */
    514 	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
    515 	if (err == 0) {
    516 		/* target fs already exists; recv into temp clone */
    517 
    518 		/* Can't recv a clone into an existing fs */
    519 		if (flags & DRR_FLAG_CLONE) {
    520 			dsl_dataset_rele(ds, dmu_recv_tag);
    521 			return (EINVAL);
    522 		}
    523 
    524 		/* must not have an incremental recv already in progress */
    525 		if (!mutex_tryenter(&ds->ds_recvlock)) {
    526 			dsl_dataset_rele(ds, dmu_recv_tag);
    527 			return (EBUSY);
    528 		}
    529 
    530 		/* tmp clone name is: tofs/%tosnap" */
    531 		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
    532 		    "%%%s", tosnap);
    533 		rbsa.force = force;
    534 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
    535 		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
    536 		if (err) {
    537 			mutex_exit(&ds->ds_recvlock);
    538 			dsl_dataset_rele(ds, dmu_recv_tag);
    539 			return (err);
    540 		}
    541 		drc->drc_logical_ds = ds;
    542 		drc->drc_real_ds = rbsa.ds;
    543 	} else if (err == ENOENT) {
    544 		/* target fs does not exist; must be a full backup or clone */
    545 		char *cp;
    546 
    547 		/*
    548 		 * If it's a non-clone incremental, we are missing the
    549 		 * target fs, so fail the recv.
    550 		 */
    551 		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
    552 			return (ENOENT);
    553 
    554 		/* Open the parent of tofs */
    555 		cp = strrchr(tofs, '/');
    556 		*cp = '\0';
    557 		err = dsl_dataset_hold(tofs, FTAG, &ds);
    558 		*cp = '/';
    559 		if (err)
    560 			return (err);
    561 
    562 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
    563 		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
    564 		dsl_dataset_rele(ds, FTAG);
    565 		if (err)
    566 			return (err);
    567 		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
    568 		drc->drc_newfs = B_TRUE;
    569 	}
    570 
    571 	return (err);
    572 }
    573 
/*
 * State shared by dmu_recv_stream() and the restore_*() helpers while a
 * stream is being read and applied.
 */
struct restorearg {
	int err;	/* first error hit while reading or applying */
	int byteswap;	/* nonzero if the stream is in opposite byte order */
	vnode_t *vp;	/* vnode the stream is read from */
	char *buf;	/* buffer restore_read() reads into and returns */
	uint64_t voff;	/* current read offset within vp */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;	/* running fletcher-4 of everything read */
};
    583 
    584 static void *
    585 restore_read(struct restorearg *ra, int len)
    586 {
    587 	void *rv;
    588 	int done = 0;
    589 
    590 	/* some things will require 8-byte alignment, so everything must */
    591 	ASSERT3U(len % 8, ==, 0);
    592 
    593 	while (done < len) {
    594 		ssize_t resid;
    595 
    596 		ra->err = vn_rdwr(UIO_READ, ra->vp,
    597 		    (caddr_t)ra->buf + done, len - done,
    598 		    ra->voff, UIO_SYSSPACE, FAPPEND,
    599 		    RLIM64_INFINITY, CRED(), &resid);
    600 
    601 		if (resid == len - done)
    602 			ra->err = EINVAL;
    603 		ra->voff += len - done - resid;
    604 		done = len - resid;
    605 		if (ra->err)
    606 			return (NULL);
    607 	}
    608 
    609 	ASSERT3U(done, ==, len);
    610 	rv = ra->buf;
    611 	if (ra->byteswap)
    612 		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
    613 	else
    614 		fletcher_4_incremental_native(rv, len, &ra->cksum);
    615 	return (rv);
    616 }
    617 
    618 static void
    619 backup_byteswap(dmu_replay_record_t *drr)
    620 {
    621 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
    622 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
    623 	drr->drr_type = BSWAP_32(drr->drr_type);
    624 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
    625 	switch (drr->drr_type) {
    626 	case DRR_BEGIN:
    627 		DO64(drr_begin.drr_magic);
    628 		DO64(drr_begin.drr_version);
    629 		DO64(drr_begin.drr_creation_time);
    630 		DO32(drr_begin.drr_type);
    631 		DO32(drr_begin.drr_flags);
    632 		DO64(drr_begin.drr_toguid);
    633 		DO64(drr_begin.drr_fromguid);
    634 		break;
    635 	case DRR_OBJECT:
    636 		DO64(drr_object.drr_object);
    637 		/* DO64(drr_object.drr_allocation_txg); */
    638 		DO32(drr_object.drr_type);
    639 		DO32(drr_object.drr_bonustype);
    640 		DO32(drr_object.drr_blksz);
    641 		DO32(drr_object.drr_bonuslen);
    642 		break;
    643 	case DRR_FREEOBJECTS:
    644 		DO64(drr_freeobjects.drr_firstobj);
    645 		DO64(drr_freeobjects.drr_numobjs);
    646 		break;
    647 	case DRR_WRITE:
    648 		DO64(drr_write.drr_object);
    649 		DO32(drr_write.drr_type);
    650 		DO64(drr_write.drr_offset);
    651 		DO64(drr_write.drr_length);
    652 		break;
    653 	case DRR_FREE:
    654 		DO64(drr_free.drr_object);
    655 		DO64(drr_free.drr_offset);
    656 		DO64(drr_free.drr_length);
    657 		break;
    658 	case DRR_END:
    659 		DO64(drr_end.drr_checksum.zc_word[0]);
    660 		DO64(drr_end.drr_checksum.zc_word[1]);
    661 		DO64(drr_end.drr_checksum.zc_word[2]);
    662 		DO64(drr_end.drr_checksum.zc_word[3]);
    663 		break;
    664 	}
    665 #undef DO64
    666 #undef DO32
    667 }
    668 
    669 static int
    670 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
    671 {
    672 	int err;
    673 	dmu_tx_t *tx;
    674 	void *data = NULL;
    675 
    676 	if (drro->drr_type == DMU_OT_NONE ||
    677 	    drro->drr_type >= DMU_OT_NUMTYPES ||
    678 	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
    679 	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
    680 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
    681 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
    682 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
    683 	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
    684 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
    685 		return (EINVAL);
    686 	}
    687 
    688 	err = dmu_object_info(os, drro->drr_object, NULL);
    689 
    690 	if (err != 0 && err != ENOENT)
    691 		return (EINVAL);
    692 
    693 	if (drro->drr_bonuslen) {
    694 		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
    695 		if (ra->err)
    696 			return (ra->err);
    697 	}
    698 
    699 	if (err == ENOENT) {
    700 		/* currently free, want to be allocated */
    701 		tx = dmu_tx_create(os);
    702 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
    703 		err = dmu_tx_assign(tx, TXG_WAIT);
    704 		if (err) {
    705 			dmu_tx_abort(tx);
    706 			return (err);
    707 		}
    708 		err = dmu_object_claim(os, drro->drr_object,
    709 		    drro->drr_type, drro->drr_blksz,
    710 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
    711 		dmu_tx_commit(tx);
    712 	} else {
    713 		/* currently allocated, want to be allocated */
    714 		err = dmu_object_reclaim(os, drro->drr_object,
    715 		    drro->drr_type, drro->drr_blksz,
    716 		    drro->drr_bonustype, drro->drr_bonuslen);
    717 	}
    718 	if (err)
    719 		return (EINVAL);
    720 
    721 	tx = dmu_tx_create(os);
    722 	dmu_tx_hold_bonus(tx, drro->drr_object);
    723 	err = dmu_tx_assign(tx, TXG_WAIT);
    724 	if (err) {
    725 		dmu_tx_abort(tx);
    726 		return (err);
    727 	}
    728 
    729 	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
    730 	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
    731 
    732 	if (data != NULL) {
    733 		dmu_buf_t *db;
    734 
    735 		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
    736 		dmu_buf_will_dirty(db, tx);
    737 
    738 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
    739 		bcopy(data, db->db_data, drro->drr_bonuslen);
    740 		if (ra->byteswap) {
    741 			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
    742 			    drro->drr_bonuslen);
    743 		}
    744 		dmu_buf_rele(db, FTAG);
    745 	}
    746 	dmu_tx_commit(tx);
    747 	return (0);
    748 }
    749 
    750 /* ARGSUSED */
    751 static int
    752 restore_freeobjects(struct restorearg *ra, objset_t *os,
    753     struct drr_freeobjects *drrfo)
    754 {
    755 	uint64_t obj;
    756 
    757 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
    758 		return (EINVAL);
    759 
    760 	for (obj = drrfo->drr_firstobj;
    761 	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
    762 	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
    763 		int err;
    764 
    765 		if (dmu_object_info(os, obj, NULL) != 0)
    766 			continue;
    767 
    768 		err = dmu_free_object(os, obj);
    769 		if (err)
    770 			return (err);
    771 	}
    772 	return (0);
    773 }
    774 
    775 static int
    776 restore_write(struct restorearg *ra, objset_t *os,
    777     struct drr_write *drrw)
    778 {
    779 	dmu_tx_t *tx;
    780 	void *data;
    781 	int err;
    782 
    783 	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
    784 	    drrw->drr_type >= DMU_OT_NUMTYPES)
    785 		return (EINVAL);
    786 
    787 	data = restore_read(ra, drrw->drr_length);
    788 	if (data == NULL)
    789 		return (ra->err);
    790 
    791 	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
    792 		return (EINVAL);
    793 
    794 	tx = dmu_tx_create(os);
    795 
    796 	dmu_tx_hold_write(tx, drrw->drr_object,
    797 	    drrw->drr_offset, drrw->drr_length);
    798 	err = dmu_tx_assign(tx, TXG_WAIT);
    799 	if (err) {
    800 		dmu_tx_abort(tx);
    801 		return (err);
    802 	}
    803 	if (ra->byteswap)
    804 		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
    805 	dmu_write(os, drrw->drr_object,
    806 	    drrw->drr_offset, drrw->drr_length, data, tx);
    807 	dmu_tx_commit(tx);
    808 	return (0);
    809 }
    810 
    811 /* ARGSUSED */
    812 static int
    813 restore_free(struct restorearg *ra, objset_t *os,
    814     struct drr_free *drrf)
    815 {
    816 	int err;
    817 
    818 	if (drrf->drr_length != -1ULL &&
    819 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
    820 		return (EINVAL);
    821 
    822 	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
    823 		return (EINVAL);
    824 
    825 	err = dmu_free_long_range(os, drrf->drr_object,
    826 	    drrf->drr_offset, drrf->drr_length);
    827 	return (err);
    828 }
    829 
    830 /*
    831  * NB: callers *must* call dmu_recv_end() if this succeeds.
    832  */
    833 int
    834 dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
    835 {
    836 	struct restorearg ra = { 0 };
    837 	dmu_replay_record_t *drr;
    838 	objset_t *os;
    839 	zio_cksum_t pcksum;
    840 
    841 	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
    842 		ra.byteswap = TRUE;
    843 
    844 	{
    845 		/* compute checksum of drr_begin record */
    846 		dmu_replay_record_t *drr;
    847 		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
    848 
    849 		drr->drr_type = DRR_BEGIN;
    850 		drr->drr_u.drr_begin = *drc->drc_drrb;
    851 		if (ra.byteswap) {
    852 			fletcher_4_incremental_byteswap(drr,
    853 			    sizeof (dmu_replay_record_t), &ra.cksum);
    854 		} else {
    855 			fletcher_4_incremental_native(drr,
    856 			    sizeof (dmu_replay_record_t), &ra.cksum);
    857 		}
    858 		kmem_free(drr, sizeof (dmu_replay_record_t));
    859 	}
    860 
    861 	if (ra.byteswap) {
    862 		struct drr_begin *drrb = drc->drc_drrb;
    863 		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
    864 		drrb->drr_version = BSWAP_64(drrb->drr_version);
    865 		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
    866 		drrb->drr_type = BSWAP_32(drrb->drr_type);
    867 		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
    868 		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
    869 	}
    870 
    871 	ra.vp = vp;
    872 	ra.voff = *voffp;
    873 	ra.bufsize = 1<<20;
    874 	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
    875 
    876 	/* these were verified in dmu_recv_begin */
    877 	ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION);
    878 	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
    879 
    880 	/*
    881 	 * Open the objset we are modifying.
    882 	 */
    883 	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
    884 
    885 	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
    886 
    887 	/*
    888 	 * Read records and process them.
    889 	 */
    890 	pcksum = ra.cksum;
    891 	while (ra.err == 0 &&
    892 	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
    893 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
    894 			ra.err = EINTR;
    895 			goto out;
    896 		}
    897 
    898 		if (ra.byteswap)
    899 			backup_byteswap(drr);
    900 
    901 		switch (drr->drr_type) {
    902 		case DRR_OBJECT:
    903 		{
    904 			/*
    905 			 * We need to make a copy of the record header,
    906 			 * because restore_{object,write} may need to
    907 			 * restore_read(), which will invalidate drr.
    908 			 */
    909 			struct drr_object drro = drr->drr_u.drr_object;
    910 			ra.err = restore_object(&ra, os, &drro);
    911 			break;
    912 		}
    913 		case DRR_FREEOBJECTS:
    914 		{
    915 			struct drr_freeobjects drrfo =
    916 			    drr->drr_u.drr_freeobjects;
    917 			ra.err = restore_freeobjects(&ra, os, &drrfo);
    918 			break;
    919 		}
    920 		case DRR_WRITE:
    921 		{
    922 			struct drr_write drrw = drr->drr_u.drr_write;
    923 			ra.err = restore_write(&ra, os, &drrw);
    924 			break;
    925 		}
    926 		case DRR_FREE:
    927 		{
    928 			struct drr_free drrf = drr->drr_u.drr_free;
    929 			ra.err = restore_free(&ra, os, &drrf);
    930 			break;
    931 		}
    932 		case DRR_END:
    933 		{
    934 			struct drr_end drre = drr->drr_u.drr_end;
    935 			/*
    936 			 * We compare against the *previous* checksum
    937 			 * value, because the stored checksum is of
    938 			 * everything before the DRR_END record.
    939 			 */
    940 			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
    941 				ra.err = ECKSUM;
    942 			goto out;
    943 		}
    944 		default:
    945 			ra.err = EINVAL;
    946 			goto out;
    947 		}
    948 		pcksum = ra.cksum;
    949 	}
    950 	ASSERT(ra.err != 0);
    951 
    952 out:
    953 	if (ra.err != 0) {
    954 		/*
    955 		 * destroy what we created, so we don't leave it in the
    956 		 * inconsistent restoring state.
    957 		 */
    958 		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
    959 
    960 		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
    961 		    B_FALSE);
    962 		if (drc->drc_real_ds != drc->drc_logical_ds) {
    963 			mutex_exit(&drc->drc_logical_ds->ds_recvlock);
    964 			dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
    965 		}
    966 	}
    967 
    968 	kmem_free(ra.buf, ra.bufsize);
    969 	*voffp = ra.voff;
    970 	return (ra.err);
    971 }
    972 
    973 struct recvendsyncarg {
    974 	char *tosnap;
    975 	uint64_t creation_time;
    976 	uint64_t toguid;
    977 };
    978 
    979 static int
    980 recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
    981 {
    982 	dsl_dataset_t *ds = arg1;
    983 	struct recvendsyncarg *resa = arg2;
    984 
    985 	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
    986 }
    987 
    988 static void
    989 recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    990 {
    991 	dsl_dataset_t *ds = arg1;
    992 	struct recvendsyncarg *resa = arg2;
    993 
    994 	dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
    995 
    996 	/* set snapshot's creation time and guid */
    997 	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
    998 	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
    999 	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
   1000 	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
   1001 
   1002 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
   1003 	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
   1004 }
   1005 
   1006 static int
   1007 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
   1008 {
   1009 	struct recvendsyncarg resa;
   1010 	dsl_dataset_t *ds = drc->drc_logical_ds;
   1011 	int err;
   1012 
   1013 	/*
   1014 	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
   1015 	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
   1016 	 * can close it.
   1017 	 */
   1018 	txg_wait_synced(ds->ds_dir->dd_pool, 0);
   1019 
   1020 	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
   1021 		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
   1022 		    drc->drc_force);
   1023 		if (err)
   1024 			goto out;
   1025 	} else {
   1026 		mutex_exit(&ds->ds_recvlock);
   1027 		dsl_dataset_rele(ds, dmu_recv_tag);
   1028 		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
   1029 		    B_FALSE);
   1030 		return (EBUSY);
   1031 	}
   1032 
   1033 	resa.creation_time = drc->drc_drrb->drr_creation_time;
   1034 	resa.toguid = drc->drc_drrb->drr_toguid;
   1035 	resa.tosnap = drc->drc_tosnap;
   1036 
   1037 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
   1038 	    recv_end_check, recv_end_sync, ds, &resa, 3);
   1039 	if (err) {
   1040 		/* swap back */
   1041 		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
   1042 	}
   1043 
   1044 out:
   1045 	mutex_exit(&ds->ds_recvlock);
   1046 	dsl_dataset_disown(ds, dmu_recv_tag);
   1047 	(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
   1048 	return (err);
   1049 }
   1050 
   1051 static int
   1052 dmu_recv_new_end(dmu_recv_cookie_t *drc)
   1053 {
   1054 	struct recvendsyncarg resa;
   1055 	dsl_dataset_t *ds = drc->drc_logical_ds;
   1056 	int err;
   1057 
   1058 	/*
   1059 	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
   1060 	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
   1061 	 * can close it.
   1062 	 */
   1063 	txg_wait_synced(ds->ds_dir->dd_pool, 0);
   1064 
   1065 	resa.creation_time = drc->drc_drrb->drr_creation_time;
   1066 	resa.toguid = drc->drc_drrb->drr_toguid;
   1067 	resa.tosnap = drc->drc_tosnap;
   1068 
   1069 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
   1070 	    recv_end_check, recv_end_sync, ds, &resa, 3);
   1071 	if (err) {
   1072 		/* clean up the fs we just recv'd into */
   1073 		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
   1074 	} else {
   1075 		/* release the hold from dmu_recv_begin */
   1076 		dsl_dataset_disown(ds, dmu_recv_tag);
   1077 	}
   1078 	return (err);
   1079 }
   1080 
   1081 int
   1082 dmu_recv_end(dmu_recv_cookie_t *drc)
   1083 {
   1084 	if (drc->drc_logical_ds != drc->drc_real_ds)
   1085 		return (dmu_recv_existing_end(drc));
   1086 	else
   1087 		return (dmu_recv_new_end(drc));
   1088 }
   1089