Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * mirror operations
     30  */
     31 
     32 #include <meta.h>
     33 #include <sys/lvm/md_mirror.h>
     34 #include <thread.h>
     35 
     36 extern	int	md_in_daemon;
     37 extern md_mn_client_list_t *mdmn_clients;
     38 
     39 /*
     40  * chain of mirrors
     41  */
     42 typedef struct mm_unit_list {
     43 	struct mm_unit_list	*next;	/* next in chain */
     44 	mdname_t		*namep;	/* mirror name */
     45 	mm_pass_num_t		pass;	/* pass number */
     46 	uint_t			done;	/* resync done */
     47 } mm_unit_list_t;
     48 
     49 /*
     50  * resync mirror
     51  * meta_lock for this set should be held on entry.
     52  */
     53 int
     54 meta_mirror_resync(
     55 	mdsetname_t		*sp,
     56 	mdname_t		*mirnp,
     57 	daddr_t			size,
     58 	md_error_t		*ep,
     59 	md_resync_cmd_t		cmd	/* Start/Block/Unblock/Kill */
     60 )
     61 {
     62 	char			*miscname;
     63 	md_resync_ioctl_t	ri;
     64 
     65 	/* should have a set */
     66 	assert(sp != NULL);
     67 	assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev)));
     68 
     69 	/* make sure we have a mirror */
     70 	if ((miscname = metagetmiscname(mirnp, ep)) == NULL)
     71 		return (-1);
     72 	if (strcmp(miscname, MD_MIRROR) != 0) {
     73 		return (mdmderror(ep, MDE_NOT_MM, meta_getminor(mirnp->dev),
     74 		    mirnp->cname));
     75 	}
     76 
     77 	/* start resync */
     78 	(void) memset(&ri, 0, sizeof (ri));
     79 	MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
     80 	ri.ri_mnum = meta_getminor(mirnp->dev);
     81 	ri.ri_copysize = size;
     82 	switch (cmd) {
     83 	case MD_RESYNC_FORCE_MNSTART:
     84 		ri.ri_flags |= MD_RI_RESYNC_FORCE_MNSTART;
     85 		break;
     86 	case MD_RESYNC_START:
     87 		ri.ri_flags = 0;
     88 		break;
     89 	case MD_RESYNC_BLOCK:
     90 		ri.ri_flags = MD_RI_BLOCK;
     91 		break;
     92 	case MD_RESYNC_UNBLOCK:
     93 		ri.ri_flags = MD_RI_UNBLOCK;
     94 		break;
     95 	case MD_RESYNC_KILL:
     96 		ri.ri_flags = MD_RI_KILL;
     97 		break;
     98 	case MD_RESYNC_KILL_NO_WAIT:
     99 		ri.ri_flags = MD_RI_KILL | MD_RI_NO_WAIT;
    100 		break;
    101 	default:
    102 		/* TODO: Add new error MDE_BAD_RESYNC_FLAGS */
    103 		return (mderror(ep, MDE_BAD_RESYNC_OPT, mirnp->cname));
    104 	}
    105 
    106 	if (metaioctl(MD_IOCSETSYNC, &ri, &ri.mde, mirnp->cname) != 0)
    107 		return (mdstealerror(ep, &ri.mde));
    108 
    109 	/* return success */
    110 	return (0);
    111 }
    112 
    113 /*
    114  * free units
    115  */
    116 static void
    117 free_units(
    118 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1]
    119 )
    120 {
    121 	uint_t		i;
    122 
    123 	for (i = 0; (i < (MD_PASS_MAX + 1)); ++i) {
    124 		mm_unit_list_t	*p, *n;
    125 
    126 		for (p = mirrors[i], n = NULL; (p != NULL); p = n) {
    127 			n = p->next;
    128 			Free(p);
    129 		}
    130 		mirrors[i] = NULL;
    131 	}
    132 }
    133 
    134 /*
    135  * setup_units:	build lists of units for each pass
    136  */
    137 static int
    138 setup_units(
    139 	mdsetname_t	*sp,
    140 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1],
    141 	md_error_t	*ep
    142 )
    143 {
    144 	mdnamelist_t	*mirrornlp = NULL;
    145 	mdnamelist_t	*p;
    146 	int		rval = 0;
    147 
    148 	/* should have a set */
    149 	assert(sp != NULL);
    150 
    151 	/* for each mirror */
    152 	if (meta_get_mirror_names(sp, &mirrornlp, 0, ep) < 0)
    153 		return (-1);
    154 	for (p = mirrornlp; (p != NULL); p = p->next) {
    155 		md_mirror_t	*mirrorp;
    156 		mm_unit_list_t	*lp;
    157 
    158 		/* get unit structure */
    159 		if ((mirrorp = meta_get_mirror(sp, p->namep, ep)) == NULL) {
    160 			rval = -1;	/* record, but ignore errors */
    161 			continue;
    162 		}
    163 
    164 		/* save info */
    165 		lp = Zalloc(sizeof (*lp));
    166 		lp->namep = p->namep;
    167 		lp->pass = mirrorp->pass_num;
    168 		if ((lp->pass < 0) || (lp->pass > MD_PASS_MAX))
    169 			lp->pass = MD_PASS_MAX;
    170 
    171 		/* put on list */
    172 		lp->next = mirrors[lp->pass];
    173 		mirrors[lp->pass] = lp;
    174 	}
    175 
    176 	/* cleanup, return error */
    177 	metafreenamelist(mirrornlp);
    178 	return (rval);
    179 }
    180 
    181 /*
    182  * resync all mirrors (in background)
    183  */
    184 int
    185 meta_mirror_resync_all(
    186 	mdsetname_t	*sp,
    187 	daddr_t		size,
    188 	md_error_t	*ep
    189 )
    190 {
    191 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
    192 	mm_pass_num_t	pass, max_pass;
    193 	int		rval = 0, fval;
    194 
    195 	/* should have a set */
    196 	assert(sp != NULL);
    197 
    198 	/* get mirrors */
    199 	(void) memset(mirrors, 0, sizeof (mirrors));
    200 	if (setup_units(sp, mirrors, ep) != 0)
    201 		return (-1);
    202 
    203 	/* fork a process */
    204 	if ((fval = md_daemonize(sp, ep)) != 0) {
    205 		/*
    206 		 * md_daemonize will fork off a process.  The is the
    207 		 * parent or error.
    208 		 */
    209 		if (fval > 0) {
    210 			free_units(mirrors);
    211 			return (0);
    212 		}
    213 		mdclrerror(ep);
    214 	}
    215 	/*
    216 	 * Closing stdin/out/err here.
    217 	 * In case this was called thru rsh, the calling process on the other
    218 	 * side will know, it doesn't have to wait until all the resyncs have
    219 	 * finished.
    220 	 * Also initialise the rpc client pool so that this process will use
    221 	 * a unique pool of clients. If we don't do this, all of the forked
    222 	 * clients will end up using the same pool of clients which can result
    223 	 * in hung clients.
    224 	 */
    225 	if (meta_is_mn_set(sp, ep)) {
    226 		(void) close(0);
    227 		(void) close(1);
    228 		(void) close(2);
    229 		mdmn_clients = NULL;
    230 	}
    231 	assert((fval == 0) || (fval == -1));
    232 
    233 	/*
    234 	 * Determine which pass level is the highest that contains mirrors to
    235 	 * resync. We only need to wait for completion of earlier levels below
    236 	 * this high watermark. If all mirrors are at the same pass level
    237 	 * there is no requirement to wait for completion.
    238 	 */
    239 
    240 	max_pass = 1;
    241 	for (pass = MD_PASS_MAX; pass > 1; --pass) {
    242 		if (mirrors[pass] != NULL) {
    243 			max_pass = pass;
    244 			break;
    245 		}
    246 	}
    247 
    248 	/*
    249 	 * max_pass now contains the highest pass-level with resyncable mirrors
    250 	 */
    251 
    252 	/* do passes */
    253 	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
    254 		int			dispatched = 0;
    255 		unsigned		howlong = 1;
    256 		mm_unit_list_t		*lp;
    257 
    258 		/* skip empty passes */
    259 		if (mirrors[pass] == NULL)
    260 			continue;
    261 
    262 		/* dispatch all resyncs in pass */
    263 		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
    264 			if (meta_is_mn_set(sp, ep)) {
    265 				if (meta_mn_send_setsync(sp, lp->namep,
    266 				    size, ep) != 0) {
    267 					rval = -1;
    268 					lp->done = 1;
    269 				} else {
    270 					++dispatched;
    271 				}
    272 			} else {
    273 				if (meta_mirror_resync(sp, lp->namep, size, ep,
    274 				    MD_RESYNC_START) != 0) {
    275 					rval = -1;
    276 					lp->done = 1;
    277 				} else {
    278 					++dispatched;
    279 				}
    280 			}
    281 		}
    282 
    283 		/*
    284 		 * Wait for them to finish iff we are at a level lower than
    285 		 * max_pass. This orders the resyncs into distinct levels.
    286 		 * I.e. level 2 resyncs won't start until all level 1 ones
    287 		 * have completed.
    288 		 */
    289 		if (pass == max_pass)
    290 			continue;
    291 
    292 		howlong = 1;
    293 		while (dispatched > 0) {
    294 
    295 			/* wait a while */
    296 			(void) sleep(howlong);
    297 
    298 			/* see if any finished */
    299 			for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
    300 				md_resync_ioctl_t	ri;
    301 
    302 				if (lp->done)
    303 					continue;
    304 
    305 				(void) memset(&ri, '\0', sizeof (ri));
    306 				ri.ri_mnum = meta_getminor(lp->namep->dev);
    307 				MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
    308 				if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
    309 				    lp->namep->cname) != 0) {
    310 					(void) mdstealerror(ep, &ri.mde);
    311 					rval = -1;
    312 					lp->done = 1;
    313 					--dispatched;
    314 				} else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
    315 					lp->done = 1;
    316 					--dispatched;
    317 				}
    318 			}
    319 
    320 			/* wait a little longer next time */
    321 			if (howlong < 10)
    322 				++howlong;
    323 		}
    324 	}
    325 
    326 	/* cleanup, return success */
    327 	free_units(mirrors);
    328 	if (fval == 0)  /* we are the child process so exit */
    329 		exit(0);
    330 	return (rval);
    331 }
    332 
    333 /*
    334  * meta_mn_mirror_resync_all:
    335  * -------------------------
    336  * Resync all mirrors associated with given set (arg). Called when master
    337  * node is adding a node to a diskset.  Only want to initiate the resync on
    338  * the current node.
    339  */
    340 void *
    341 meta_mn_mirror_resync_all(void *arg)
    342 {
    343 	set_t		setno = *((set_t *)arg);
    344 	mdsetname_t	*sp;
    345 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
    346 	mm_pass_num_t	pass, max_pass;
    347 	md_error_t	mde = mdnullerror;
    348 	int		fval;
    349 
    350 
    351 	/* should have a set */
    352 	assert(setno != NULL);
    353 
    354 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
    355 		mde_perror(&mde, "");
    356 		return (NULL);
    357 	}
    358 
    359 	if (!(meta_is_mn_set(sp, &mde))) {
    360 		mde_perror(&mde, "");
    361 		return (NULL);
    362 	}
    363 
    364 	/* fork a process */
    365 	if ((fval = md_daemonize(sp, &mde)) != 0) {
    366 		/*
    367 		 * md_daemonize will fork off a process.  The is the
    368 		 * parent or error.
    369 		 */
    370 		if (fval > 0) {
    371 			return (NULL);
    372 		}
    373 		mde_perror(&mde, "");
    374 		return (NULL);
    375 	}
    376 	/*
    377 	 * Child process should never return back to rpc.metad, but
    378 	 * should exit.
    379 	 * Flush all internally cached data inherited from parent process
    380 	 * since cached data will be cleared when parent process RPC request
    381 	 * has completed (which is possibly before this child process
    382 	 * can complete).
    383 	 * Child process can retrieve and cache its own copy of data from
    384 	 * rpc.metad that won't be changed by the parent process.
    385 	 *
    386 	 * Reset md_in_daemon since this child will be a client of rpc.metad
    387 	 * not part of the rpc.metad daemon itself.
    388 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
    389 	 * this thread is rpc.metad or any other thread.  (If this thread
    390 	 * was rpc.metad it could use some short circuit code to get data
    391 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
    392 	 */
    393 	md_in_daemon = 0;
    394 	metaflushsetname(sp);
    395 	sr_cache_flush_setno(setno);
    396 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
    397 		mde_perror(&mde, "");
    398 		md_exit(sp, 1);
    399 	}
    400 
    401 	if (meta_lock(sp, TRUE, &mde) != 0) {
    402 		mde_perror(&mde, "");
    403 		md_exit(sp, 1);
    404 	}
    405 
    406 	/*
    407 	 * Closing stdin/out/err here.
    408 	 */
    409 	(void) close(0);
    410 	(void) close(1);
    411 	(void) close(2);
    412 	assert(fval == 0);
    413 
    414 	/* get mirrors */
    415 	(void) memset(mirrors, 0, sizeof (mirrors));
    416 	if (setup_units(sp, mirrors, &mde) != 0) {
    417 		(void) meta_unlock(sp, &mde);
    418 		md_exit(sp, 1);
    419 	}
    420 
    421 	/*
    422 	 * Determine which pass level is the highest that contains mirrors to
    423 	 * resync. We only need to wait for completion of earlier levels below
    424 	 * this high watermark. If all mirrors are at the same pass level
    425 	 * there is no requirement to wait for completion.
    426 	 */
    427 	max_pass = 1;
    428 	for (pass = MD_PASS_MAX; pass > 1; --pass) {
    429 		if (mirrors[pass] != NULL) {
    430 			max_pass = pass;
    431 			break;
    432 		}
    433 	}
    434 
    435 	/*
    436 	 * max_pass now contains the highest pass-level with resyncable mirrors
    437 	 */
    438 	/* do passes */
    439 	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
    440 		int			dispatched = 0;
    441 		unsigned		howlong = 1;
    442 		mm_unit_list_t		*lp;
    443 
    444 		/* skip empty passes */
    445 		if (mirrors[pass] == NULL)
    446 			continue;
    447 
    448 		/* dispatch all resyncs in pass */
    449 		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
    450 			if (meta_mirror_resync(sp, lp->namep, 0, &mde,
    451 			    MD_RESYNC_FORCE_MNSTART) != 0) {
    452 				mdclrerror(&mde);
    453 				lp->done = 1;
    454 			} else {
    455 				++dispatched;
    456 			}
    457 		}
    458 
    459 		/*
    460 		 * Wait for them to finish iff we are at a level lower than
    461 		 * max_pass. This orders the resyncs into distinct levels.
    462 		 * I.e. level 2 resyncs won't start until all level 1 ones
    463 		 * have completed.
    464 		 */
    465 		if (pass == max_pass)
    466 			continue;
    467 
    468 		howlong = 1;
    469 		while (dispatched > 0) {
    470 
    471 			/* wait a while */
    472 			(void) sleep(howlong);
    473 
    474 			/* see if any finished */
    475 			for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
    476 				md_resync_ioctl_t	ri;
    477 
    478 				if (lp->done)
    479 					continue;
    480 
    481 				(void) memset(&ri, '\0', sizeof (ri));
    482 				ri.ri_mnum = meta_getminor(lp->namep->dev);
    483 				MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
    484 				if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
    485 				    lp->namep->cname) != 0) {
    486 					mdclrerror(&mde);
    487 					lp->done = 1;
    488 					--dispatched;
    489 				} else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
    490 					lp->done = 1;
    491 					--dispatched;
    492 				}
    493 			}
    494 
    495 			/* wait a little longer next time */
    496 			if (howlong < 10)
    497 				++howlong;
    498 		}
    499 	}
    500 
    501 	/* cleanup, return success */
    502 	free_units(mirrors);
    503 	(void) meta_unlock(sp, &mde);
    504 	md_exit(sp, 0);
    505 	/*NOTREACHED*/
    506 	return (NULL);
    507 }
    508 
    509 /*
    510  * meta_mirror_resync_process:
    511  * --------------------------
    512  * Modify any resync that is in progress on this node for the given set.
    513  *
    514  * Input Parameters:
    515  *	sp	setname to scan for mirrors
    516  *	cmd	action to take:
    517  *		MD_RESYNC_KILL	- kill all resync threads
    518  *		MD_RESYNC_BLOCK	- block all resync threads
    519  *		MD_RESYNC_UNBLOCK - resume all resync threads
    520  * Output Parameters
    521  *	ep	error return structure
    522  *
    523  * meta_lock for this set should be held on entry.
    524  */
    525 static void
    526 meta_mirror_resync_process(mdsetname_t *sp, md_error_t *ep, md_resync_cmd_t cmd)
    527 {
    528 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
    529 	mm_pass_num_t	pass;
    530 
    531 	/* Grab all the mirrors from the set (if any) */
    532 	(void) memset(mirrors, 0, sizeof (mirrors));
    533 	if (setup_units(sp, mirrors, ep) != 0)
    534 		return;
    535 
    536 	/* do passes */
    537 	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
    538 		mm_unit_list_t		*lp;
    539 
    540 		/* skip empty passes */
    541 		if (mirrors[pass] == NULL)
    542 			continue;
    543 
    544 		/* Process all resyncs in pass */
    545 		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
    546 			(void) meta_mirror_resync(sp, lp->namep, 0, ep,
    547 			    cmd);
    548 		}
    549 	}
    550 
    551 	/* Clear up mirror units */
    552 	free_units(mirrors);
    553 }
    554 
    555 /*
    556  * meta_mirror_resync_process_all:
    557  * ------------------------------
    558  * Issue the given resync command to all mirrors contained in all multi-node
    559  * sets.
    560  *
    561  * Input Parameters:
    562  *	cmd	- MD_RESYNC_KILL, MD_RESYNC_BLOCK, MD_RESYNC_UNBLOCK
    563  */
    564 static void
    565 meta_mirror_resync_process_all(md_resync_cmd_t cmd)
    566 {
    567 	set_t		setno, max_sets;
    568 	md_error_t	mde = mdnullerror;
    569 	mdsetname_t	*this_sp;
    570 	md_set_desc	*sd;
    571 
    572 	/*
    573 	 * Traverse all sets looking for multi-node capable ones.
    574 	 */
    575 	max_sets = get_max_sets(&mde);
    576 	for (setno = 1; setno < max_sets; setno++) {
    577 		mde = mdnullerror;
    578 		if (this_sp = metasetnosetname(setno, &mde)) {
    579 			if ((sd = metaget_setdesc(this_sp, &mde)) == NULL)
    580 				continue;
    581 			if (!MD_MNSET_DESC(sd))
    582 				continue;
    583 
    584 			if (meta_lock(this_sp, TRUE, &mde)) {
    585 				continue;
    586 			}
    587 			meta_mirror_resync_process(this_sp, &mde, cmd);
    588 			(void) meta_unlock(this_sp, &mde);
    589 		}
    590 	}
    591 }
    592 
    593 /*
    594  * meta_mirror_resync_kill_all:
    595  * ---------------------------
    596  * Abort any resync that is in progress on this node. Scan all sets for all
    597  * mirrors.
    598  * Note: this routine is provided for future use. For example to kill all
    599  *	 resyncs on a node this could be used as long as the
    600  *	 mddoors / rpc.mdcommd tuple is running on all members of the cluster.
    601  */
    602 void
    603 meta_mirror_resync_kill_all(void)
    604 {
    605 	meta_mirror_resync_process_all(MD_RESYNC_KILL);
    606 }
    607 
    608 /*
    609  * meta_mirror_resync_block_all:
    610  * ----------------------------
    611  * Block all resyncs that are in progress. This causes the resync state to
    612  * freeze on this machine, and can be resumed by calling
    613  * meta_mirror_resync_unblock_all.
    614  */
    615 void
    616 meta_mirror_resync_block_all(void)
    617 {
    618 	meta_mirror_resync_process_all(MD_RESYNC_BLOCK);
    619 }
    620 
    621 /*
    622  * meta_mirror_resync_unblock_all:
    623  * ------------------------------
    624  * Unblock all previously blocked resync threads on this node.
    625  */
    626 void
    627 meta_mirror_resync_unblock_all(void)
    628 {
    629 	meta_mirror_resync_process_all(MD_RESYNC_UNBLOCK);
    630 }
    631 
    632 /*
    633  * meta_mirror_resync_unblock:
    634  * --------------------------
    635  * Unblock any previously blocked resync threads for the given set.
    636  * meta_lock for this set should be held on entry.
    637  */
    638 void
    639 meta_mirror_resync_unblock(mdsetname_t *sp)
    640 {
    641 	md_error_t	mde = mdnullerror;
    642 
    643 	meta_mirror_resync_process(sp, &mde, MD_RESYNC_UNBLOCK);
    644 }
    645 
    646 /*
    647  * meta_mirror_resync_kill:
    648  * -----------------------
    649  * Kill any resync threads running on mirrors in the given set.
    650  * Called when releasing a set (meta_set_prv.c`halt_set)
    651  */
    652 void
    653 meta_mirror_resync_kill(mdsetname_t *sp)
    654 {
    655 	md_error_t	mde = mdnullerror;
    656 
    657 	meta_mirror_resync_process(sp, &mde, MD_RESYNC_KILL);
    658 }
    659