Home | History | Annotate | Download | only in btree
      1     0  stevel #pragma ident	"%Z%%M%	%I%	%E% SMI"
      2     0  stevel 
      3     0  stevel /*-
      4     0  stevel  * Copyright (c) 1990, 1993, 1994
      5     0  stevel  *	The Regents of the University of California.  All rights reserved.
      6     0  stevel  *
      7     0  stevel  * This code is derived from software contributed to Berkeley by
      8     0  stevel  * Mike Olson.
      9     0  stevel  *
     10     0  stevel  * Redistribution and use in source and binary forms, with or without
     11     0  stevel  * modification, are permitted provided that the following conditions
     12     0  stevel  * are met:
     13     0  stevel  * 1. Redistributions of source code must retain the above copyright
     14     0  stevel  *    notice, this list of conditions and the following disclaimer.
     15     0  stevel  * 2. Redistributions in binary form must reproduce the above copyright
     16     0  stevel  *    notice, this list of conditions and the following disclaimer in the
     17     0  stevel  *    documentation and/or other materials provided with the distribution.
     18     0  stevel  * 3. All advertising materials mentioning features or use of this software
     19     0  stevel  *    must display the following acknowledgement:
     20     0  stevel  *	This product includes software developed by the University of
     21     0  stevel  *	California, Berkeley and its contributors.
     22     0  stevel  * 4. Neither the name of the University nor the names of its contributors
     23     0  stevel  *    may be used to endorse or promote products derived from this software
     24     0  stevel  *    without specific prior written permission.
     25     0  stevel  *
     26     0  stevel  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     27     0  stevel  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28     0  stevel  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29     0  stevel  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     30     0  stevel  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     31     0  stevel  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     32     0  stevel  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     33     0  stevel  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     34     0  stevel  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     35     0  stevel  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     36     0  stevel  * SUCH DAMAGE.
     37     0  stevel  */
     38     0  stevel 
     39     0  stevel #if defined(LIBC_SCCS) && !defined(lint)
     40     0  stevel static char sccsid[] = "@(#)bt_split.c	8.10 (Berkeley) 1/9/95";
     41     0  stevel #endif /* LIBC_SCCS and not lint */
     42     0  stevel 
     43     0  stevel #include <sys/types.h>
     44     0  stevel 
     45     0  stevel #include <limits.h>
     46     0  stevel #include <stdio.h>
     47     0  stevel #include <stdlib.h>
     48     0  stevel #include <string.h>
     49     0  stevel 
     50     0  stevel #include "db-int.h"
     51     0  stevel #include "btree.h"
     52     0  stevel 
     53     0  stevel static int	 bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
     54     0  stevel static PAGE	*bt_page
     55     0  stevel 		    __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
     56     0  stevel static int	 bt_preserve __P((BTREE *, db_pgno_t));
     57     0  stevel static PAGE	*bt_psplit
     58     0  stevel 		    __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
     59     0  stevel static PAGE	*bt_root
     60     0  stevel 		    __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
     61     0  stevel static int	 bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
     62     0  stevel static recno_t	 rec_total __P((PAGE *));
     63     0  stevel 
     64     0  stevel #ifdef STATISTICS
     65     0  stevel u_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
     66     0  stevel #endif
     67     0  stevel 
     68     0  stevel /*
     69     0  stevel  * __BT_SPLIT -- Split the tree.
     70     0  stevel  *
     71     0  stevel  * Parameters:
     72     0  stevel  *	t:	tree
     73     0  stevel  *	sp:	page to split
     74     0  stevel  *	key:	key to insert
     75     0  stevel  *	data:	data to insert
     76     0  stevel  *	flags:	BIGKEY/BIGDATA flags
     77     0  stevel  *	ilen:	insert length
     78     0  stevel  *	skip:	index to leave open
     79     0  stevel  *
     80     0  stevel  * Returns:
     81     0  stevel  *	RET_ERROR, RET_SUCCESS
     82     0  stevel  */
     83     0  stevel int
     84     0  stevel __bt_split(t, sp, key, data, flags, ilen, argskip)
     85     0  stevel 	BTREE *t;
     86     0  stevel 	PAGE *sp;
     87     0  stevel 	const DBT *key, *data;
     88     0  stevel 	int flags;
     89     0  stevel 	size_t ilen;
     90     0  stevel 	u_int32_t argskip;
     91     0  stevel {
     92     0  stevel 	BINTERNAL *bi;
     93     0  stevel 	BLEAF *bl, *tbl;
     94     0  stevel 	DBT a, b;
     95     0  stevel 	EPGNO *parent;
     96     0  stevel 	PAGE *h, *l, *r, *lchild, *rchild;
     97     0  stevel 	indx_t nxtindex;
     98     0  stevel 	u_int16_t skip;
     99     0  stevel 	u_int32_t n, nbytes, nksize;
    100     0  stevel 	int parentsplit;
    101     0  stevel 	char *dest;
    102     0  stevel 
    103     0  stevel 	/*
    104     0  stevel 	 * Split the page into two pages, l and r.  The split routines return
    105     0  stevel 	 * a pointer to the page into which the key should be inserted and with
    106     0  stevel 	 * skip set to the offset which should be used.  Additionally, l and r
    107     0  stevel 	 * are pinned.
    108     0  stevel 	 */
    109     0  stevel 	skip = argskip;
    110     0  stevel 	h = sp->pgno == P_ROOT ?
    111     0  stevel 	    bt_root(t, sp, &l, &r, &skip, ilen) :
    112     0  stevel 	    bt_page(t, sp, &l, &r, &skip, ilen);
    113     0  stevel 	if (h == NULL)
    114     0  stevel 		return (RET_ERROR);
    115     0  stevel 
    116     0  stevel 	/*
    117     0  stevel 	 * Insert the new key/data pair into the leaf page.  (Key inserts
    118     0  stevel 	 * always cause a leaf page to split first.)
    119     0  stevel 	 */
    120     0  stevel 	h->linp[skip] = h->upper -= ilen;
    121     0  stevel 	dest = (char *)h + h->upper;
    122     0  stevel 	if (F_ISSET(t, R_RECNO))
    123     0  stevel 		WR_RLEAF(dest, data, flags)
    124     0  stevel 	else
    125     0  stevel 		WR_BLEAF(dest, key, data, flags)
    126     0  stevel 
    127     0  stevel 	/* If the root page was split, make it look right. */
    128     0  stevel 	if (sp->pgno == P_ROOT &&
    129     0  stevel 	    (F_ISSET(t, R_RECNO) ?
    130     0  stevel 	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
    131     0  stevel 		goto err2;
    132     0  stevel 
    133     0  stevel 	/*
    134     0  stevel 	 * Now we walk the parent page stack -- a LIFO stack of the pages that
    135     0  stevel 	 * were traversed when we searched for the page that split.  Each stack
    136     0  stevel 	 * entry is a page number and a page index offset.  The offset is for
    137     0  stevel 	 * the page traversed on the search.  We've just split a page, so we
    138     0  stevel 	 * have to insert a new key into the parent page.
    139     0  stevel 	 *
    140     0  stevel 	 * If the insert into the parent page causes it to split, may have to
    141     0  stevel 	 * continue splitting all the way up the tree.  We stop if the root
    142     0  stevel 	 * splits or the page inserted into didn't have to split to hold the
    143     0  stevel 	 * new key.  Some algorithms replace the key for the old page as well
    144     0  stevel 	 * as the new page.  We don't, as there's no reason to believe that the
    145     0  stevel 	 * first key on the old page is any better than the key we have, and,
    146     0  stevel 	 * in the case of a key being placed at index 0 causing the split, the
    147     0  stevel 	 * key is unavailable.
    148     0  stevel 	 *
    149     0  stevel 	 * There are a maximum of 5 pages pinned at any time.  We keep the left
    150     0  stevel 	 * and right pages pinned while working on the parent.   The 5 are the
    151     0  stevel 	 * two children, left parent and right parent (when the parent splits)
    152     0  stevel 	 * and the root page or the overflow key page when calling bt_preserve.
    153     0  stevel 	 * This code must make sure that all pins are released other than the
    154     0  stevel 	 * root page or overflow page which is unlocked elsewhere.
    155     0  stevel 	 */
    156     0  stevel 	while ((parent = BT_POP(t)) != NULL) {
    157     0  stevel 		lchild = l;
    158     0  stevel 		rchild = r;
    159     0  stevel 
    160     0  stevel 		/* Get the parent page. */
    161     0  stevel 		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
    162     0  stevel 			goto err2;
    163     0  stevel 
    164     0  stevel 	 	/*
    165     0  stevel 		 * The new key goes ONE AFTER the index, because the split
    166     0  stevel 		 * was to the right.
    167     0  stevel 		 */
    168     0  stevel 		skip = parent->index + 1;
    169     0  stevel 
    170     0  stevel 		/*
    171     0  stevel 		 * Calculate the space needed on the parent page.
    172     0  stevel 		 *
    173     0  stevel 		 * Prefix trees: space hack when inserting into BINTERNAL
    174     0  stevel 		 * pages.  Retain only what's needed to distinguish between
    175     0  stevel 		 * the new entry and the LAST entry on the page to its left.
    176     0  stevel 		 * If the keys compare equal, retain the entire key.  Note,
    177     0  stevel 		 * we don't touch overflow keys, and the entire key must be
    178     0  stevel 		 * retained for the next-to-left most key on the leftmost
    179     0  stevel 		 * page of each level, or the search will fail.  Applicable
    180     0  stevel 		 * ONLY to internal pages that have leaf pages as children.
    181     0  stevel 		 * Further reduction of the key between pairs of internal
    182     0  stevel 		 * pages loses too much information.
    183     0  stevel 		 */
    184     0  stevel 		switch (rchild->flags & P_TYPE) {
    185     0  stevel 		case P_BINTERNAL:
    186     0  stevel 			bi = GETBINTERNAL(rchild, 0);
    187     0  stevel 			nbytes = NBINTERNAL(bi->ksize);
    188     0  stevel 			break;
    189     0  stevel 		case P_BLEAF:
    190     0  stevel 			bl = GETBLEAF(rchild, 0);
    191     0  stevel 			nbytes = NBINTERNAL(bl->ksize);
    192     0  stevel 			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
    193     0  stevel 			    (h->prevpg != P_INVALID || skip > 1)) {
    194     0  stevel 				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
    195     0  stevel 				a.size = tbl->ksize;
    196     0  stevel 				a.data = tbl->bytes;
    197     0  stevel 				b.size = bl->ksize;
    198     0  stevel 				b.data = bl->bytes;
    199     0  stevel 				nksize = t->bt_pfx(&a, &b);
    200     0  stevel 				n = NBINTERNAL(nksize);
    201     0  stevel 				if (n < nbytes) {
    202     0  stevel #ifdef STATISTICS
    203     0  stevel 					bt_pfxsaved += nbytes - n;
    204     0  stevel #endif
    205     0  stevel 					nbytes = n;
    206     0  stevel 				} else
    207     0  stevel 					nksize = 0;
    208     0  stevel 			} else
    209     0  stevel 				nksize = 0;
    210     0  stevel 			break;
    211     0  stevel 		case P_RINTERNAL:
    212     0  stevel 		case P_RLEAF:
    213     0  stevel 			nbytes = NRINTERNAL;
    214     0  stevel 			break;
    215     0  stevel 		default:
    216     0  stevel 			abort();
    217     0  stevel 		}
    218     0  stevel 
    219     0  stevel 		/* Split the parent page if necessary or shift the indices. */
    220     0  stevel 		if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
    221     0  stevel 			sp = h;
    222     0  stevel 			h = h->pgno == P_ROOT ?
    223     0  stevel 			    bt_root(t, h, &l, &r, &skip, nbytes) :
    224     0  stevel 			    bt_page(t, h, &l, &r, &skip, nbytes);
    225     0  stevel 			if (h == NULL)
    226     0  stevel 				goto err1;
    227     0  stevel 			parentsplit = 1;
    228     0  stevel 		} else {
    229     0  stevel 			if (skip < (nxtindex = NEXTINDEX(h)))
    230     0  stevel 				memmove(h->linp + skip + 1, h->linp + skip,
    231     0  stevel 				    (nxtindex - skip) * sizeof(indx_t));
    232     0  stevel 			h->lower += sizeof(indx_t);
    233     0  stevel 			parentsplit = 0;
    234     0  stevel 		}
    235     0  stevel 
    236     0  stevel 		/* Insert the key into the parent page. */
    237     0  stevel 		switch (rchild->flags & P_TYPE) {
    238     0  stevel 		case P_BINTERNAL:
    239     0  stevel 			h->linp[skip] = h->upper -= nbytes;
    240     0  stevel 			dest = (char *)h + h->linp[skip];
    241     0  stevel 			memmove(dest, bi, nbytes);
    242     0  stevel 			((BINTERNAL *)dest)->pgno = rchild->pgno;
    243     0  stevel 			break;
    244     0  stevel 		case P_BLEAF:
    245     0  stevel 			h->linp[skip] = h->upper -= nbytes;
    246     0  stevel 			dest = (char *)h + h->linp[skip];
    247     0  stevel 			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
    248     0  stevel 			    rchild->pgno, bl->flags & P_BIGKEY);
    249     0  stevel 			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
    250     0  stevel 			if (bl->flags & P_BIGKEY &&
    251     0  stevel 			    bt_preserve(t, *(db_pgno_t *)bl->bytes) == RET_ERROR)
    252     0  stevel 				goto err1;
    253     0  stevel 			break;
    254     0  stevel 		case P_RINTERNAL:
    255     0  stevel 			/*
    256     0  stevel 			 * Update the left page count.  If split
    257     0  stevel 			 * added at index 0, fix the correct page.
    258     0  stevel 			 */
    259     0  stevel 			if (skip > 0)
    260     0  stevel 				dest = (char *)h + h->linp[skip - 1];
    261     0  stevel 			else
    262     0  stevel 				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
    263     0  stevel 			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
    264     0  stevel 			((RINTERNAL *)dest)->pgno = lchild->pgno;
    265     0  stevel 
    266     0  stevel 			/* Update the right page count. */
    267     0  stevel 			h->linp[skip] = h->upper -= nbytes;
    268     0  stevel 			dest = (char *)h + h->linp[skip];
    269     0  stevel 			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
    270     0  stevel 			((RINTERNAL *)dest)->pgno = rchild->pgno;
    271     0  stevel 			break;
    272     0  stevel 		case P_RLEAF:
    273     0  stevel 			/*
    274     0  stevel 			 * Update the left page count.  If split
    275     0  stevel 			 * added at index 0, fix the correct page.
    276     0  stevel 			 */
    277     0  stevel 			if (skip > 0)
    278     0  stevel 				dest = (char *)h + h->linp[skip - 1];
    279     0  stevel 			else
    280     0  stevel 				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
    281     0  stevel 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
    282     0  stevel 			((RINTERNAL *)dest)->pgno = lchild->pgno;
    283     0  stevel 
    284     0  stevel 			/* Update the right page count. */
    285     0  stevel 			h->linp[skip] = h->upper -= nbytes;
    286     0  stevel 			dest = (char *)h + h->linp[skip];
    287     0  stevel 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
    288     0  stevel 			((RINTERNAL *)dest)->pgno = rchild->pgno;
    289     0  stevel 			break;
    290     0  stevel 		default:
    291     0  stevel 			abort();
    292     0  stevel 		}
    293     0  stevel 
    294     0  stevel 		/* Unpin the held pages. */
    295     0  stevel 		if (!parentsplit) {
    296     0  stevel 			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
    297     0  stevel 			break;
    298     0  stevel 		}
    299     0  stevel 
    300     0  stevel 		/* If the root page was split, make it look right. */
    301     0  stevel 		if (sp->pgno == P_ROOT &&
    302     0  stevel 		    (F_ISSET(t, R_RECNO) ?
    303     0  stevel 		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
    304     0  stevel 			goto err1;
    305     0  stevel 
    306     0  stevel 		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
    307     0  stevel 		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
    308     0  stevel 	}
    309     0  stevel 
    310     0  stevel 	/* Unpin the held pages. */
    311     0  stevel 	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
    312     0  stevel 	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
    313     0  stevel 
    314     0  stevel 	/* Clear any pages left on the stack. */
    315     0  stevel 	return (RET_SUCCESS);
    316     0  stevel 
    317     0  stevel 	/*
    318     0  stevel 	 * If something fails in the above loop we were already walking back
    319     0  stevel 	 * up the tree and the tree is now inconsistent.  Nothing much we can
    320     0  stevel 	 * do about it but release any memory we're holding.
    321     0  stevel 	 */
    322     0  stevel err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
    323     0  stevel 	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
    324     0  stevel 
    325     0  stevel err2:	mpool_put(t->bt_mp, l, 0);
    326     0  stevel 	mpool_put(t->bt_mp, r, 0);
    327     0  stevel 	__dbpanic(t->bt_dbp);
    328     0  stevel 	return (RET_ERROR);
    329     0  stevel }
    330     0  stevel 
    331     0  stevel /*
    332     0  stevel  * BT_PAGE -- Split a non-root page of a btree.
    333     0  stevel  *
    334     0  stevel  * Parameters:
    335     0  stevel  *	t:	tree
    336     0  stevel  *	h:	root page
    337     0  stevel  *	lp:	pointer to left page pointer
    338     0  stevel  *	rp:	pointer to right page pointer
    339     0  stevel  *	skip:	pointer to index to leave open
    340     0  stevel  *	ilen:	insert length
    341     0  stevel  *
    342     0  stevel  * Returns:
    343     0  stevel  *	Pointer to page in which to insert or NULL on error.
    344     0  stevel  */
    345     0  stevel static PAGE *
    346     0  stevel bt_page(t, h, lp, rp, skip, ilen)
    347     0  stevel 	BTREE *t;
    348     0  stevel 	PAGE *h, **lp, **rp;
    349     0  stevel 	indx_t *skip;
    350     0  stevel 	size_t ilen;
    351     0  stevel {
    352     0  stevel 	PAGE *l, *r, *tp;
    353     0  stevel 	db_pgno_t npg;
    354     0  stevel 
    355     0  stevel #ifdef STATISTICS
    356     0  stevel 	++bt_split;
    357     0  stevel #endif
    358     0  stevel 	/* Put the new right page for the split into place. */
    359     0  stevel 	if ((r = __bt_new(t, &npg)) == NULL)
    360     0  stevel 		return (NULL);
    361     0  stevel 	r->pgno = npg;
    362     0  stevel 	r->lower = BTDATAOFF;
    363     0  stevel 	r->upper = t->bt_psize;
    364     0  stevel 	r->nextpg = h->nextpg;
    365     0  stevel 	r->prevpg = h->pgno;
    366     0  stevel 	r->flags = h->flags & P_TYPE;
    367     0  stevel 
    368     0  stevel 	/*
    369     0  stevel 	 * If we're splitting the last page on a level because we're appending
    370     0  stevel 	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
    371     0  stevel 	 * sorted.  Adding an empty page on the side of the level is less work
    372     0  stevel 	 * and can push the fill factor much higher than normal.  If we're
    373     0  stevel 	 * wrong it's no big deal, we'll just do the split the right way next
    374     0  stevel 	 * time.  It may look like it's equally easy to do a similar hack for
    375     0  stevel 	 * reverse sorted data, that is, split the tree left, but it's not.
    376     0  stevel 	 * Don't even try.
    377     0  stevel 	 */
    378     0  stevel 	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
    379     0  stevel #ifdef STATISTICS
    380     0  stevel 		++bt_sortsplit;
    381     0  stevel #endif
    382     0  stevel 		h->nextpg = r->pgno;
    383     0  stevel 		r->lower = BTDATAOFF + sizeof(indx_t);
    384     0  stevel 		*skip = 0;
    385     0  stevel 		*lp = h;
    386     0  stevel 		*rp = r;
    387     0  stevel 		return (r);
    388     0  stevel 	}
    389     0  stevel 
    390     0  stevel 	/* Put the new left page for the split into place. */
    391     0  stevel 	if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
    392     0  stevel 		mpool_put(t->bt_mp, r, 0);
    393     0  stevel 		return (NULL);
    394     0  stevel 	}
    395     0  stevel #ifdef PURIFY
    396     0  stevel 	memset(l, 0xff, t->bt_psize);
    397     0  stevel #endif
    398     0  stevel 	l->pgno = h->pgno;
    399     0  stevel 	l->nextpg = r->pgno;
    400     0  stevel 	l->prevpg = h->prevpg;
    401     0  stevel 	l->lower = BTDATAOFF;
    402     0  stevel 	l->upper = t->bt_psize;
    403     0  stevel 	l->flags = h->flags & P_TYPE;
    404     0  stevel 
    405     0  stevel 	/* Fix up the previous pointer of the page after the split page. */
    406     0  stevel 	if (h->nextpg != P_INVALID) {
    407     0  stevel 		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
    408     0  stevel 			free(l);
    409     0  stevel 			/* XXX mpool_free(t->bt_mp, r->pgno); */
    410     0  stevel 			return (NULL);
    411     0  stevel 		}
    412     0  stevel 		tp->prevpg = r->pgno;
    413     0  stevel 		mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
    414     0  stevel 	}
    415     0  stevel 
    416     0  stevel 	/*
    417     0  stevel 	 * Split right.  The key/data pairs aren't sorted in the btree page so
    418     0  stevel 	 * it's simpler to copy the data from the split page onto two new pages
    419     0  stevel 	 * instead of copying half the data to the right page and compacting
    420     0  stevel 	 * the left page in place.  Since the left page can't change, we have
    421     0  stevel 	 * to swap the original and the allocated left page after the split.
    422     0  stevel 	 */
    423     0  stevel 	tp = bt_psplit(t, h, l, r, skip, ilen);
    424     0  stevel 
    425     0  stevel 	/* Move the new left page onto the old left page. */
    426     0  stevel 	memmove(h, l, t->bt_psize);
    427     0  stevel 	if (tp == l)
    428     0  stevel 		tp = h;
    429     0  stevel 	free(l);
    430     0  stevel 
    431     0  stevel 	*lp = h;
    432     0  stevel 	*rp = r;
    433     0  stevel 	return (tp);
    434     0  stevel }
    435     0  stevel 
    436     0  stevel /*
    437     0  stevel  * BT_ROOT -- Split the root page of a btree.
    438     0  stevel  *
    439     0  stevel  * Parameters:
    440     0  stevel  *	t:	tree
    441     0  stevel  *	h:	root page
    442     0  stevel  *	lp:	pointer to left page pointer
    443     0  stevel  *	rp:	pointer to right page pointer
    444     0  stevel  *	skip:	pointer to index to leave open
    445     0  stevel  *	ilen:	insert length
    446     0  stevel  *
    447     0  stevel  * Returns:
    448     0  stevel  *	Pointer to page in which to insert or NULL on error.
    449     0  stevel  */
    450     0  stevel static PAGE *
    451     0  stevel bt_root(t, h, lp, rp, skip, ilen)
    452     0  stevel 	BTREE *t;
    453     0  stevel 	PAGE *h, **lp, **rp;
    454     0  stevel 	indx_t *skip;
    455     0  stevel 	size_t ilen;
    456     0  stevel {
    457     0  stevel 	PAGE *l, *r, *tp;
    458     0  stevel 	db_pgno_t lnpg, rnpg;
    459     0  stevel 
    460     0  stevel #ifdef STATISTICS
    461     0  stevel 	++bt_split;
    462     0  stevel 	++bt_rootsplit;
    463     0  stevel #endif
    464     0  stevel 	/* Put the new left and right pages for the split into place. */
    465     0  stevel 	if ((l = __bt_new(t, &lnpg)) == NULL ||
    466     0  stevel 	    (r = __bt_new(t, &rnpg)) == NULL)
    467     0  stevel 		return (NULL);
    468     0  stevel 	l->pgno = lnpg;
    469     0  stevel 	r->pgno = rnpg;
    470     0  stevel 	l->nextpg = r->pgno;
    471     0  stevel 	r->prevpg = l->pgno;
    472     0  stevel 	l->prevpg = r->nextpg = P_INVALID;
    473     0  stevel 	l->lower = r->lower = BTDATAOFF;
    474     0  stevel 	l->upper = r->upper = t->bt_psize;
    475     0  stevel 	l->flags = r->flags = h->flags & P_TYPE;
    476     0  stevel 
    477     0  stevel 	/* Split the root page. */
    478     0  stevel 	tp = bt_psplit(t, h, l, r, skip, ilen);
    479     0  stevel 
    480     0  stevel 	*lp = l;
    481     0  stevel 	*rp = r;
    482     0  stevel 	return (tp);
    483     0  stevel }
    484     0  stevel 
    485     0  stevel /*
    486     0  stevel  * BT_RROOT -- Fix up the recno root page after it has been split.
    487     0  stevel  *
    488     0  stevel  * Parameters:
    489     0  stevel  *	t:	tree
    490     0  stevel  *	h:	root page
    491     0  stevel  *	l:	left page
    492     0  stevel  *	r:	right page
    493     0  stevel  *
    494     0  stevel  * Returns:
    495     0  stevel  *	RET_ERROR, RET_SUCCESS
    496     0  stevel  */
    497     0  stevel static int
    498     0  stevel bt_rroot(t, h, l, r)
    499     0  stevel 	BTREE *t;
    500     0  stevel 	PAGE *h, *l, *r;
    501     0  stevel {
    502     0  stevel 	char *dest;
    503     0  stevel 
    504     0  stevel 	/* Insert the left and right keys, set the header information. */
    505     0  stevel 	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
    506     0  stevel 	dest = (char *)h + h->upper;
    507     0  stevel 	WR_RINTERNAL(dest,
    508     0  stevel 	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
    509     0  stevel 
    510     0  stevel 	h->linp[1] = h->upper -= NRINTERNAL;
    511     0  stevel 	dest = (char *)h + h->upper;
    512     0  stevel 	WR_RINTERNAL(dest,
    513     0  stevel 	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
    514     0  stevel 
    515     0  stevel 	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
    516     0  stevel 
    517     0  stevel 	/* Unpin the root page, set to recno internal page. */
    518     0  stevel 	h->flags &= ~P_TYPE;
    519     0  stevel 	h->flags |= P_RINTERNAL;
    520     0  stevel 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
    521     0  stevel 
    522     0  stevel 	return (RET_SUCCESS);
    523     0  stevel }
    524     0  stevel 
    525     0  stevel /*
    526     0  stevel  * BT_BROOT -- Fix up the btree root page after it has been split.
    527     0  stevel  *
    528     0  stevel  * Parameters:
    529     0  stevel  *	t:	tree
    530     0  stevel  *	h:	root page
    531     0  stevel  *	l:	left page
    532     0  stevel  *	r:	right page
    533     0  stevel  *
    534     0  stevel  * Returns:
    535     0  stevel  *	RET_ERROR, RET_SUCCESS
    536     0  stevel  */
    537     0  stevel static int
    538     0  stevel bt_broot(t, h, l, r)
    539     0  stevel 	BTREE *t;
    540     0  stevel 	PAGE *h, *l, *r;
    541     0  stevel {
    542     0  stevel 	BINTERNAL *bi;
    543     0  stevel 	BLEAF *bl;
    544     0  stevel 	u_int32_t nbytes;
    545     0  stevel 	char *dest;
    546     0  stevel 
    547     0  stevel 	/*
    548     0  stevel 	 * If the root page was a leaf page, change it into an internal page.
    549     0  stevel 	 * We copy the key we split on (but not the key's data, in the case of
    550     0  stevel 	 * a leaf page) to the new root page.
    551     0  stevel 	 *
    552     0  stevel 	 * The btree comparison code guarantees that the left-most key on any
    553     0  stevel 	 * level of the tree is never used, so it doesn't need to be filled in.
    554     0  stevel 	 */
    555     0  stevel 	nbytes = NBINTERNAL(0);
    556     0  stevel 	h->linp[0] = h->upper = t->bt_psize - nbytes;
    557     0  stevel 	dest = (char *)h + h->upper;
    558     0  stevel 	WR_BINTERNAL(dest, 0, l->pgno, 0);
    559     0  stevel 
    560     0  stevel 	switch (h->flags & P_TYPE) {
    561     0  stevel 	case P_BLEAF:
    562     0  stevel 		bl = GETBLEAF(r, 0);
    563     0  stevel 		nbytes = NBINTERNAL(bl->ksize);
    564     0  stevel 		h->linp[1] = h->upper -= nbytes;
    565     0  stevel 		dest = (char *)h + h->upper;
    566     0  stevel 		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
    567     0  stevel 		memmove(dest, bl->bytes, bl->ksize);
    568     0  stevel 
    569     0  stevel 		/*
    570     0  stevel 		 * If the key is on an overflow page, mark the overflow chain
    571     0  stevel 		 * so it isn't deleted when the leaf copy of the key is deleted.
    572     0  stevel 		 */
    573     0  stevel 		if (bl->flags & P_BIGKEY &&
    574     0  stevel 		    bt_preserve(t, *(db_pgno_t *)bl->bytes) == RET_ERROR)
    575     0  stevel 			return (RET_ERROR);
    576     0  stevel 		break;
    577     0  stevel 	case P_BINTERNAL:
    578     0  stevel 		bi = GETBINTERNAL(r, 0);
    579     0  stevel 		nbytes = NBINTERNAL(bi->ksize);
    580     0  stevel 		h->linp[1] = h->upper -= nbytes;
    581     0  stevel 		dest = (char *)h + h->upper;
    582     0  stevel 		memmove(dest, bi, nbytes);
    583     0  stevel 		((BINTERNAL *)dest)->pgno = r->pgno;
    584     0  stevel 		break;
    585     0  stevel 	default:
    586     0  stevel 		abort();
    587     0  stevel 	}
    588     0  stevel 
    589     0  stevel 	/* There are two keys on the page. */
    590     0  stevel 	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
    591     0  stevel 
    592     0  stevel 	/* Unpin the root page, set to btree internal page. */
    593     0  stevel 	h->flags &= ~P_TYPE;
    594     0  stevel 	h->flags |= P_BINTERNAL;
    595     0  stevel 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
    596     0  stevel 
    597     0  stevel 	return (RET_SUCCESS);
    598     0  stevel }
    599     0  stevel 
    600     0  stevel /*
    601     0  stevel  * BT_PSPLIT -- Do the real work of splitting the page.
    602     0  stevel  *
    603     0  stevel  * Parameters:
    604     0  stevel  *	t:	tree
    605     0  stevel  *	h:	page to be split
    606     0  stevel  *	l:	page to put lower half of data
    607     0  stevel  *	r:	page to put upper half of data
    608     0  stevel  *	pskip:	pointer to index to leave open
    609     0  stevel  *	ilen:	insert length
    610     0  stevel  *
    611     0  stevel  * Returns:
    612     0  stevel  *	Pointer to page in which to insert.
    613     0  stevel  */
    614     0  stevel static PAGE *
    615     0  stevel bt_psplit(t, h, l, r, pskip, ilen)
    616     0  stevel 	BTREE *t;
    617     0  stevel 	PAGE *h, *l, *r;
    618     0  stevel 	indx_t *pskip;
    619     0  stevel 	size_t ilen;
    620     0  stevel {
    621     0  stevel 	BINTERNAL *bi;
    622     0  stevel 	BLEAF *bl;
    623     0  stevel 	CURSOR *c;
    624     0  stevel 	RLEAF *rl;
    625     0  stevel 	PAGE *rval;
    626     0  stevel 	void *src;
    627     0  stevel 	indx_t full, half, nxt, off, skip, top, used;
    628     0  stevel 	u_int32_t nbytes;
    629     0  stevel 	int bigkeycnt, isbigkey;
    630     0  stevel 
    631     0  stevel 	/*
    632     0  stevel 	 * Split the data to the left and right pages.  Leave the skip index
    633     0  stevel 	 * open.  Additionally, make some effort not to split on an overflow
    634     0  stevel 	 * key.  This makes internal page processing faster and can save
    635     0  stevel 	 * space as overflow keys used by internal pages are never deleted.
    636     0  stevel 	 */
    637     0  stevel 	bigkeycnt = 0;
    638     0  stevel 	skip = *pskip;
    639     0  stevel 	full = t->bt_psize - BTDATAOFF;
    640     0  stevel 	half = full / 2;
    641     0  stevel 	used = 0;
    642     0  stevel 	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
    643     0  stevel 		if (skip == off) {
    644     0  stevel 			nbytes = ilen;
    645     0  stevel 			isbigkey = 0;		/* XXX: not really known. */
    646     0  stevel 		} else
    647     0  stevel 			switch (h->flags & P_TYPE) {
    648     0  stevel 			case P_BINTERNAL:
    649     0  stevel 				src = bi = GETBINTERNAL(h, nxt);
    650     0  stevel 				nbytes = NBINTERNAL(bi->ksize);
    651     0  stevel 				isbigkey = bi->flags & P_BIGKEY;
    652     0  stevel 				break;
    653     0  stevel 			case P_BLEAF:
    654     0  stevel 				src = bl = GETBLEAF(h, nxt);
    655     0  stevel 				nbytes = NBLEAF(bl);
    656     0  stevel 				isbigkey = bl->flags & P_BIGKEY;
    657     0  stevel 				break;
    658     0  stevel 			case P_RINTERNAL:
    659     0  stevel 				src = GETRINTERNAL(h, nxt);
    660     0  stevel 				nbytes = NRINTERNAL;
    661     0  stevel 				isbigkey = 0;
    662     0  stevel 				break;
    663     0  stevel 			case P_RLEAF:
    664     0  stevel 				src = rl = GETRLEAF(h, nxt);
    665     0  stevel 				nbytes = NRLEAF(rl);
    666     0  stevel 				isbigkey = 0;
    667     0  stevel 				break;
    668     0  stevel 			default:
    669     0  stevel 				abort();
    670     0  stevel 			}
    671     0  stevel 
    672     0  stevel 		/*
    673     0  stevel 		 * If the key/data pairs are substantial fractions of the max
    674     0  stevel 		 * possible size for the page, it's possible to get situations
    675     0  stevel 		 * where we decide to try and copy too much onto the left page.
    676     0  stevel 		 * Make sure that doesn't happen.
    677     0  stevel 		 */
    678     0  stevel 		if ((skip <= off && used + nbytes + sizeof(indx_t) >= full)
    679     0  stevel 		    || nxt == top - 1) {
    680     0  stevel 			--off;
    681     0  stevel 			break;
    682     0  stevel 		}
    683     0  stevel 
    684     0  stevel 		/* Copy the key/data pair, if not the skipped index. */
    685     0  stevel 		if (skip != off) {
    686     0  stevel 			++nxt;
    687     0  stevel 
    688     0  stevel 			l->linp[off] = l->upper -= nbytes;
    689     0  stevel 			memmove((char *)l + l->upper, src, nbytes);
    690     0  stevel 		}
    691     0  stevel 
    692     0  stevel 		used += nbytes + sizeof(indx_t);
    693     0  stevel 		if (used >= half) {
    694     0  stevel 			if (!isbigkey || bigkeycnt == 3)
    695     0  stevel 				break;
    696     0  stevel 			else
    697     0  stevel 				++bigkeycnt;
    698     0  stevel 		}
    699     0  stevel 	}
    700     0  stevel 
    701     0  stevel 	/*
    702     0  stevel 	 * Off is the last offset that's valid for the left page.
    703     0  stevel 	 * Nxt is the first offset to be placed on the right page.
    704     0  stevel 	 */
    705     0  stevel 	l->lower += (off + 1) * sizeof(indx_t);
    706     0  stevel 
    707     0  stevel 	/*
    708     0  stevel 	 * If splitting the page that the cursor was on, the cursor has to be
    709     0  stevel 	 * adjusted to point to the same record as before the split.  If the
    710     0  stevel 	 * cursor is at or past the skipped slot, the cursor is incremented by
    711     0  stevel 	 * one.  If the cursor is on the right page, it is decremented by the
    712     0  stevel 	 * number of records split to the left page.
    713     0  stevel 	 */
    714     0  stevel 	c = &t->bt_cursor;
    715     0  stevel 	if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
    716     0  stevel 		if (c->pg.index >= skip)
    717     0  stevel 			++c->pg.index;
    718     0  stevel 		if (c->pg.index < nxt)			/* Left page. */
    719     0  stevel 			c->pg.pgno = l->pgno;
    720     0  stevel 		else {					/* Right page. */
    721     0  stevel 			c->pg.pgno = r->pgno;
    722     0  stevel 			c->pg.index -= nxt;
    723     0  stevel 		}
    724     0  stevel 	}
    725     0  stevel 
    726     0  stevel 	/*
    727     0  stevel 	 * If the skipped index was on the left page, just return that page.
    728     0  stevel 	 * Otherwise, adjust the skip index to reflect the new position on
    729     0  stevel 	 * the right page.
    730     0  stevel 	 */
    731     0  stevel 	if (skip <= off) {
    732  5975  semery 		/*
    733  5975  semery 		 * If we get here then 'skip' is in the left page.  We do
    734  5975  semery 		 * not want to mix this with the right page, so we assign
    735  5975  semery 		 * an unrealistic value (-1).
    736  5975  semery 		 */
    737  5975  semery 		skip = (indx_t)-1;
    738     0  stevel 		rval = l;
    739     0  stevel 	} else {
    740     0  stevel 		rval = r;
    741     0  stevel 		*pskip -= nxt;
    742     0  stevel 	}
    743     0  stevel 
    744     0  stevel 	for (off = 0; nxt < top; ++off) {
    745     0  stevel 		if (skip == nxt) {
    746     0  stevel 			++off;
    747  5975  semery 			/*
    748  5975  semery 			 * Assign 'skip' an unrealistic value (-1) to ensure
    749  5975  semery 			 * it is not matched again.
    750  5975  semery 			 */
    751  5975  semery 			skip = (indx_t)-1;
    752     0  stevel 		}
    753     0  stevel 		switch (h->flags & P_TYPE) {
    754     0  stevel 		case P_BINTERNAL:
    755     0  stevel 			src = bi = GETBINTERNAL(h, nxt);
    756     0  stevel 			nbytes = NBINTERNAL(bi->ksize);
    757     0  stevel 			break;
    758     0  stevel 		case P_BLEAF:
    759     0  stevel 			src = bl = GETBLEAF(h, nxt);
    760     0  stevel 			nbytes = NBLEAF(bl);
    761     0  stevel 			break;
    762     0  stevel 		case P_RINTERNAL:
    763     0  stevel 			src = GETRINTERNAL(h, nxt);
    764     0  stevel 			nbytes = NRINTERNAL;
    765     0  stevel 			break;
    766     0  stevel 		case P_RLEAF:
    767     0  stevel 			src = rl = GETRLEAF(h, nxt);
    768     0  stevel 			nbytes = NRLEAF(rl);
    769     0  stevel 			break;
    770     0  stevel 		default:
    771     0  stevel 			abort();
    772     0  stevel 		}
    773     0  stevel 		++nxt;
    774     0  stevel 		r->linp[off] = r->upper -= nbytes;
    775     0  stevel 		memmove((char *)r + r->upper, src, nbytes);
    776     0  stevel 	}
    777     0  stevel 	r->lower += off * sizeof(indx_t);
    778     0  stevel 
    779     0  stevel 	/* If the key is being appended to the page, adjust the index. */
    780     0  stevel 	if (skip == top)
    781     0  stevel 		r->lower += sizeof(indx_t);
    782     0  stevel 
    783     0  stevel 	return (rval);
    784     0  stevel }
    785     0  stevel 
    786     0  stevel /*
    787     0  stevel  * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
    788     0  stevel  *
    789     0  stevel  * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
    790     0  stevel  * record that references them gets deleted.  Chains pointed to by internal
    791     0  stevel  * pages never get deleted.  This routine marks a chain as pointed to by an
    792     0  stevel  * internal page.
    793     0  stevel  *
    794     0  stevel  * Parameters:
    795     0  stevel  *	t:	tree
    796     0  stevel  *	pg:	page number of first page in the chain.
    797     0  stevel  *
    798     0  stevel  * Returns:
    799     0  stevel  *	RET_SUCCESS, RET_ERROR.
    800     0  stevel  */
    801     0  stevel static int
    802     0  stevel bt_preserve(t, pg)
    803     0  stevel 	BTREE *t;
    804     0  stevel 	db_pgno_t pg;
    805     0  stevel {
    806     0  stevel 	PAGE *h;
    807     0  stevel 
    808     0  stevel 	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
    809     0  stevel 		return (RET_ERROR);
    810     0  stevel 	h->flags |= P_PRESERVE;
    811     0  stevel 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
    812     0  stevel 	return (RET_SUCCESS);
    813     0  stevel }
    814     0  stevel 
    815     0  stevel /*
    816     0  stevel  * REC_TOTAL -- Return the number of recno entries below a page.
    817     0  stevel  *
    818     0  stevel  * Parameters:
    819     0  stevel  *	h:	page
    820     0  stevel  *
    821     0  stevel  * Returns:
    822     0  stevel  *	The number of recno entries below a page.
    823     0  stevel  *
    824     0  stevel  * XXX
    825     0  stevel  * These values could be set by the bt_psplit routine.  The problem is that the
    826     0  stevel  * entry has to be popped off of the stack etc. or the values have to be passed
    827     0  stevel  * all the way back to bt_split/bt_rroot and it's not very clean.
    828     0  stevel  */
    829     0  stevel static recno_t
    830     0  stevel rec_total(h)
    831     0  stevel 	PAGE *h;
    832     0  stevel {
    833     0  stevel 	recno_t recs;
    834     0  stevel 	indx_t nxt, top;
    835     0  stevel 
    836     0  stevel 	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
    837     0  stevel 		recs += GETRINTERNAL(h, nxt)->nrecs;
    838     0  stevel 	return (recs);
    839     0  stevel }
    840