aboutsummaryrefslogtreecommitdiff
path: root/db2/btree/bt_put.c
diff options
context:
space:
mode:
Diffstat (limited to 'db2/btree/bt_put.c')
-rw-r--r--db2/btree/bt_put.c461
1 files changed, 296 insertions, 165 deletions
diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c
index af09f76..b3d775b 100644
--- a/db2/btree/bt_put.c
+++ b/db2/btree/bt_put.c
@@ -47,7 +47,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)bt_put.c 10.25 (Sleepycat) 9/17/97";
+static const char sccsid[] = "@(#)bt_put.c 10.31 (Sleepycat) 10/26/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -66,7 +66,10 @@ static const char sccsid[] = "@(#)bt_put.c 10.25 (Sleepycat) 9/17/97";
static int __bam_fixed __P((BTREE *, DBT *));
static int __bam_lookup __P((DB *, DBT *, int *));
static int __bam_ndup __P((DB *, PAGE *, u_int32_t));
-static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t));
+static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *));
+static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t, u_int32_t));
+static u_int32_t
+ __bam_partsize __P((DB *, DBT *, PAGE *, u_int32_t));
/*
* __bam_put --
@@ -334,21 +337,6 @@ slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp));
}
/*
- * OVPUT --
- * Copy an overflow item onto a page.
- */
-#undef OVPUT
-#define OVPUT(h, indx, bo) do { \
- DBT __hdr; \
- memset(&__hdr, 0, sizeof(__hdr)); \
- __hdr.data = &bo; \
- __hdr.size = BOVERFLOW_SIZE; \
- if ((ret = __db_pitem(dbp, \
- h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \
- return (ret); \
-} while (0)
-
-/*
* __bam_iitem --
* Insert an item into the tree.
*
@@ -365,19 +353,18 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
{
BTREE *t;
BKEYDATA *bk;
- BOVERFLOW kbo, dbo;
DBT tdbt;
PAGE *h;
db_indx_t indx;
- u_int32_t have_bytes, need_bytes, needed;
- int bigkey, bigdata, dcopy, dupadjust, ret;
+ u_int32_t data_size, have_bytes, need_bytes, needed;
+ int bigkey, bigdata, dupadjust, replace, ret;
t = dbp->internal;
h = *hp;
indx = *indxp;
- dupadjust = 0;
bk = NULL; /* XXX: Shut the compiler up. */
+ dupadjust = replace = 0;
/*
* If it's a page of duplicates, call the common code to do the work.
@@ -385,7 +372,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
* !!!
* Here's where the hp and indxp are important. The duplicate code
* may decide to rework/rearrange the pages and indices we're using,
- * so the caller must understand that the stack has to change.
+ * so the caller must understand that the page stack may change.
*/
if (TYPE(h) == P_DUPLICATE) {
/* Adjust the index for the new item if it's a DB_AFTER op. */
@@ -401,24 +388,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
return (__db_dput(dbp, data, hp, indxp, __bam_new));
}
- /*
- * XXX
- * Handle partial puts.
- *
- * This is truly awful from a performance standput. We don't optimize
- * for partial puts at all, we delete the record and add it back in,
- * regardless of size or if we're simply overwriting current data.
- * The hash access method does this a lot better than we do, and we're
- * eventually going to have to fix it.
- */
- if (F_ISSET(data, DB_DBT_PARTIAL)) {
- tdbt = *data;
- if ((ret = __bam_partial(dbp, &tdbt, h, indx)) != 0)
- return (ret);
- data = &tdbt;
- }
-
- /* If it's a short fixed-length record, fix it up. */
+ /* Handle fixed-length records: build the real record. */
if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) {
tdbt = *data;
if ((ret = __bam_fixed(t, &tdbt)) != 0)
@@ -427,30 +397,15 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
}
/*
- * If the key or data item won't fit on a page, store it in the
- * overflow pages.
- *
- * !!!
- * From this point on, we have to recover the allocated overflow
- * pages on error.
+ * Figure out how much space the data will take, including if it's a
+ * partial record. If either of the key or data items won't fit on
+ * a page, we'll have to store them on overflow pages.
*/
- bigkey = bigdata = 0;
- if (LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize) {
- B_TSET(kbo.type, B_OVERFLOW, 0);
- kbo.tlen = key->size;
- if ((ret = __db_poff(dbp, key, &kbo.pgno, __bam_new)) != 0)
- goto err;
- bigkey = 1;
- }
- if (data->size > t->bt_ovflsize) {
- B_TSET(dbo.type, B_OVERFLOW, 0);
- dbo.tlen = data->size;
- if ((ret = __db_poff(dbp, data, &dbo.pgno, __bam_new)) != 0)
- goto err;
- bigdata = 1;
- }
+ bigkey = LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize;
+ data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
+ __bam_partsize(dbp, data, h, indx) : data->size;
+ bigdata = data_size > t->bt_ovflsize;
- dcopy = 0;
needed = 0;
if (LF_ISSET(BI_NEWKEY)) {
/* If BI_NEWKEY is set we're adding a new key and data pair. */
@@ -461,7 +416,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
if (bigdata)
needed += BOVERFLOW_PSIZE;
else
- needed += BKEYDATA_PSIZE(data->size);
+ needed += BKEYDATA_PSIZE(data_size);
} else {
/*
* We're either overwriting the data item of a key/data pair
@@ -482,16 +437,8 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
if (bigdata)
need_bytes += BOVERFLOW_PSIZE;
else
- need_bytes += BKEYDATA_PSIZE(data->size);
+ need_bytes += BKEYDATA_PSIZE(data_size);
- /*
- * If we're overwriting a data item, we copy it if it's not a
- * special record type and it's the same size (including any
- * alignment) and do a delete/insert otherwise.
- */
- if (op == DB_CURRENT && !bigdata &&
- B_TYPE(bk->type) == B_KEYDATA && have_bytes == need_bytes)
- dcopy = 1;
if (have_bytes < need_bytes)
needed += need_bytes - have_bytes;
}
@@ -505,9 +452,15 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
* check in the btree split code, so we don't undo it there!?!?
*/
if (P_FREESPACE(h) < needed ||
- (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) {
- ret = DB_NEEDSPLIT;
- goto err;
+ (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey))
+ return (DB_NEEDSPLIT);
+
+ /* Handle partial puts: build the real record. */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ tdbt = *data;
+ if ((ret = __bam_partial(dbp, &tdbt, h, indx, data_size)) != 0)
+ return (ret);
+ data = &tdbt;
}
/*
@@ -515,10 +468,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
*
* 1. Append a new key/data pair.
* 2. Insert a new key/data pair.
- * 3. Copy the data item.
- * 4. Delete/insert the data item.
- * 5. Append a new data item.
- * 6. Insert a new data item.
+ * 3. Append a new data item (a new duplicate).
+ * 4. Insert a new data item (a new duplicate).
+ * 5. Overflow item: delete and re-add the data item.
+ * 6. Replace the data item.
*/
if (LF_ISSET(BI_NEWKEY)) {
switch (op) {
@@ -533,42 +486,17 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
}
/* Add the key. */
- if (bigkey)
- OVPUT(h, indx, kbo);
- else {
- DBT __data;
- memset(&__data, 0, sizeof(__data));
- __data.data = key->data;
- __data.size = key->size;
+ if (bigkey) {
+ if ((ret = __bam_ovput(dbp, h, indx, key)) != 0)
+ return (ret);
+ } else
if ((ret = __db_pitem(dbp, h, indx,
- BKEYDATA_SIZE(key->size), NULL, &__data)) != 0)
- goto err;
- }
+ BKEYDATA_SIZE(key->size), NULL, key)) != 0)
+ return (ret);
++indx;
} else {
switch (op) {
- case DB_CURRENT: /* 3. Copy the data item. */
- /*
- * If we're not logging and it's possible, overwrite
- * the current item.
- *
- * XXX
- * We should add a separate logging message so that
- * we can do this anytime it's possible, including
- * for partial record puts.
- */
- if (dcopy && !DB_LOGGING(dbp)) {
- bk->len = data->size;
- memcpy(bk->data, data->data, data->size);
- goto done;
- }
- /* 4. Delete/insert the data item. */
- if (TYPE(h) == P_LBTREE)
- ++indx;
- if ((ret = __bam_ditem(dbp, h, indx)) != 0)
- goto err;
- break;
- case DB_AFTER: /* 5. Append a new data item. */
+ case DB_AFTER: /* 3. Append a new data item. */
if (TYPE(h) == P_LBTREE) {
/*
* Adjust the cursor and copy in the key for
@@ -576,7 +504,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
*/
if ((ret = __bam_adjindx(dbp,
h, indx + P_INDX, indx, 1)) != 0)
- goto err;
+ return (ret);
indx += 3;
dupadjust = 1;
@@ -589,7 +517,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
*indxp += 1;
}
break;
- case DB_BEFORE: /* 6. Insert a new data item. */
+ case DB_BEFORE: /* 4. Insert a new data item. */
if (TYPE(h) == P_LBTREE) {
/*
* Adjust the cursor and copy in the key for
@@ -597,43 +525,62 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
*/
if ((ret =
__bam_adjindx(dbp, h, indx, indx, 1)) != 0)
- goto err;
+ return (ret);
++indx;
dupadjust = 1;
} else
__bam_ca_di(dbp, h->pgno, indx, 1);
break;
+ case DB_CURRENT:
+ if (TYPE(h) == P_LBTREE)
+ ++indx;
+
+ /*
+ * 5. Delete/re-add the data item.
+ *
+ * If we're dealing with offpage items, we have to
+ * delete and then re-add the item.
+ */
+ if (bigdata || B_TYPE(bk->type) == B_OVERFLOW) {
+ if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+ return (ret);
+ break;
+ }
+
+ /* 6. Replace the data item. */
+ replace = 1;
+ break;
default:
abort();
}
}
/* Add the data. */
- if (bigdata)
- OVPUT(h, indx, dbo);
- else {
+ if (bigdata) {
+ if ((ret = __bam_ovput(dbp, h, indx, data)) != 0)
+ return (ret);
+ } else {
BKEYDATA __bk;
- DBT __hdr, __data;
- memset(&__data, 0, sizeof(__data));
- __data.data = data->data;
- __data.size = data->size;
+ DBT __hdr;
if (LF_ISSET(BI_DELETED)) {
B_TSET(__bk.type, B_KEYDATA, 1);
- __bk.len = __data.size;
+ __bk.len = data->size;
__hdr.data = &__bk;
__hdr.size = SSZA(BKEYDATA, data);
ret = __db_pitem(dbp, h, indx,
- BKEYDATA_SIZE(__data.size), &__hdr, &__data);
- } else
+ BKEYDATA_SIZE(data->size), &__hdr, data);
+ } else if (replace)
+ ret = __bam_ritem(dbp, h, indx, data);
+ else
ret = __db_pitem(dbp, h, indx,
- BKEYDATA_SIZE(data->size), NULL, &__data);
+ BKEYDATA_SIZE(data->size), NULL, data);
if (ret != 0)
- goto err;
+ return (ret);
}
-done: ++t->lstat.bt_added;
+ ++t->lstat.bt_added;
ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
@@ -645,22 +592,206 @@ done: ++t->lstat.bt_added;
if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) {
--indx;
if ((ret = __bam_ndup(dbp, h, indx)) != 0)
- goto err;
+ return (ret);
}
if (t->bt_recno != NULL)
F_SET(t->bt_recno, RECNO_MODIFIED);
- if (0) {
-err: if (bigkey)
- (void)__db_doff(dbp, kbo.pgno, __bam_free);
- if (bigdata)
- (void)__db_doff(dbp, dbo.pgno, __bam_free);
- }
return (ret);
}
/*
+ * __bam_partsize --
+ * Figure out how much space a partial data item is in total.
+ */
+static u_int32_t
+__bam_partsize(dbp, data, h, indx)
+ DB *dbp;
+ DBT *data;
+ PAGE *h;
+ u_int32_t indx;
+{
+ BKEYDATA *bk;
+ u_int32_t nbytes;
+
+ /*
+ * Figure out how much total space we'll need. If the record doesn't
+ * already exist, it's simply the data we're provided.
+ */
+ if (indx >= NUM_ENT(h))
+ return (data->doff + data->size);
+
+ /*
+ * Otherwise, it's the data provided plus any already existing data
+ * that we're not replacing.
+ */
+ bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ nbytes =
+ B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+
+ /*
+ * There are really two cases here:
+ *
+ * Case 1: We are replacing some bytes that do not exist (i.e., they
+ * are past the end of the record). In this case the number of bytes
+ * we are replacing is irrelevant and all we care about is how many
+ * bytes we are going to add from offset. So, the new record length
+ * is going to be the size of the new bytes (size) plus wherever those
+ * new bytes begin (doff).
+ *
+ * Case 2: All the bytes we are replacing exist. Therefore, the new
+ * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
+ * plus the bytes we are adding (size).
+ */
+ if (nbytes < data->doff + data->dlen) /* Case 1 */
+ return (data->doff + data->size);
+
+ return (nbytes + data->size - data->dlen); /* Case 2 */
+}
+
+/*
+ * OVPUT --
+ * Copy an overflow item onto a page.
+ */
+#undef OVPUT
+#define OVPUT(h, indx, bo) do { \
+ DBT __hdr; \
+ memset(&__hdr, 0, sizeof(__hdr)); \
+ __hdr.data = &bo; \
+ __hdr.size = BOVERFLOW_SIZE; \
+ if ((ret = __db_pitem(dbp, \
+ h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \
+ return (ret); \
+} while (0)
+
+/*
+ * __bam_ovput --
+ * Build an overflow item and put it on the page.
+ */
+static int
+__bam_ovput(dbp, h, indx, item)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *item;
+{
+ BOVERFLOW bo;
+ int ret;
+
+ B_TSET(bo.type, B_OVERFLOW, 0);
+ bo.tlen = item->size;
+ if ((ret = __db_poff(dbp, item, &bo.pgno, __bam_new)) != 0)
+ return (ret);
+
+ OVPUT(h, indx, bo);
+
+ return (0);
+}
+
+/*
+ * __bam_ritem --
+ * Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *));
+ */
+int
+__bam_ritem(dbp, h, indx, data)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *data;
+{
+ BKEYDATA *bk;
+ DBT orig, repl;
+ db_indx_t lo, ln, min, off, prefix, suffix;
+ int32_t nbytes;
+ int cnt, ret;
+ u_int8_t *p, *t;
+
+ /*
+ * Replace a single item onto a page. The logic figuring out where
+ * to insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling.
+ */
+ bk = GET_BKEYDATA(h, indx);
+
+ /* Log the change. */
+ if (DB_LOGGING(dbp)) {
+ /*
+ * We might as well check to see if the two data items share
+ * a common prefix and suffix -- it can save us a lot of log
+ * message if they're large.
+ */
+ min = data->size < bk->len ? data->size : bk->len;
+ for (prefix = 0,
+ p = bk->data, t = data->data;
+ prefix < min && *p == *t; ++prefix, ++p, ++t)
+ ;
+
+ min -= prefix;
+ for (suffix = 0,
+ p = (u_int8_t *)bk->data + bk->len - 1,
+ t = (u_int8_t *)data->data + data->size - 1;
+ suffix < min && *p == *t; ++suffix, --p, --t)
+ ;
+
+ /* We only log the parts of the keys that have changed. */
+ orig.data = (u_int8_t *)bk->data + prefix;
+ orig.size = bk->len - (prefix + suffix);
+ repl.data = (u_int8_t *)data->data + prefix;
+ repl.size = data->size - (prefix + suffix);
+ if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbp->txn,
+ &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h),
+ (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type),
+ &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Set references to the first in-use byte on the page and the
+ * first byte of the item being replaced.
+ */
+ p = (u_int8_t *)h + HOFFSET(h);
+ t = (u_int8_t *)bk;
+
+ /*
+ * If the entry is growing in size, shift the beginning of the data
+ * part of the page down. If the entry is shrinking in size, shift
+ * the beginning of the data part of the page up. Use memmove(3),
+ * the regions overlap.
+ */
+ lo = BKEYDATA_SIZE(bk->len);
+ ln = BKEYDATA_SIZE(data->size);
+ if (lo != ln) {
+ nbytes = lo - ln; /* Signed difference. */
+ if (p == t) /* First index is fast. */
+ h->inp[indx] += nbytes;
+ else { /* Else, shift the page. */
+ memmove(p + nbytes, p, t - p);
+
+ /* Adjust the indices' offsets. */
+ off = h->inp[indx];
+ for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
+ if (h->inp[cnt] <= off)
+ h->inp[cnt] += nbytes;
+ }
+
+ /* Clean up the page and adjust the item's reference. */
+ HOFFSET(h) += nbytes;
+ t += nbytes;
+ }
+
+ /* Copy the new item onto the page. */
+ bk = (BKEYDATA *)t;
+ B_TSET(bk->type, B_KEYDATA, 0);
+ bk->len = data->size;
+ memcpy(bk->data, data->data, data->size);
+
+ return (0);
+}
+
+/*
* __bam_ndup --
* Check to see if the duplicate set at indx should have its own page.
* If it should, create it.
@@ -766,16 +897,21 @@ __bam_fixed(t, dbt)
rp = t->bt_recno;
/*
- * If using fixed-length records, and the record is long, return
- * EINVAL. If it's short, pad it out. Use the record data return
- * memory, it's only short-term.
+ * If database contains fixed-length records, and the record is long,
+ * return EINVAL.
*/
if (dbt->size > rp->re_len)
return (EINVAL);
+
+ /*
+ * The caller checked to see if it was just right, so we know it's
+ * short. Pad it out. We use the record data return memory, it's
+ * only a short-term use.
+ */
if (t->bt_rdata.ulen < rp->re_len) {
t->bt_rdata.data = t->bt_rdata.data == NULL ?
- (void *)malloc(rp->re_len) :
- (void *)realloc(t->bt_rdata.data, rp->re_len);
+ (void *)__db_malloc(rp->re_len) :
+ (void *)__db_realloc(t->bt_rdata.data, rp->re_len);
if (t->bt_rdata.data == NULL) {
t->bt_rdata.ulen = 0;
return (ENOMEM);
@@ -786,12 +922,16 @@ __bam_fixed(t, dbt)
memset((u_int8_t *)t->bt_rdata.data + dbt->size,
rp->re_pad, rp->re_len - dbt->size);
- /* Set the DBT to reference our new record. */
+ /*
+ * Clean up our flags and other information just in case, and
+ * change the caller's DBT to reference our created record.
+ */
t->bt_rdata.size = rp->re_len;
t->bt_rdata.dlen = 0;
t->bt_rdata.doff = 0;
t->bt_rdata.flags = 0;
*dbt = t->bt_rdata;
+
return (0);
}
@@ -800,47 +940,28 @@ __bam_fixed(t, dbt)
* Build the real record for a partial put.
*/
static int
-__bam_partial(dbp, dbt, h, indx)
+__bam_partial(dbp, dbt, h, indx, nbytes)
DB *dbp;
DBT *dbt;
PAGE *h;
- u_int32_t indx;
+ u_int32_t indx, nbytes;
{
BTREE *t;
BKEYDATA *bk, tbk;
BOVERFLOW *bo;
DBT copy;
- u_int32_t len, nbytes, tlen;
+ u_int32_t len, tlen;
int ret;
u_int8_t *p;
bo = NULL; /* XXX: Shut the compiler up. */
t = dbp->internal;
- /*
- * Figure out how much total space we'll need. Worst case is where
- * the record is 0 bytes long, in which case doff causes the record
- * to extend, and the put data is appended to it.
- */
- if (indx < NUM_ENT(h)) {
- bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
- if (B_TYPE(bk->type) == B_OVERFLOW) {
- bo = (BOVERFLOW *)bk;
- nbytes = bo->tlen;
- } else
- nbytes = bk->len;
- } else {
- bk = &tbk;
- B_TSET(bk->type, B_KEYDATA, 0);
- nbytes = bk->len = 0;
- }
- nbytes += dbt->doff + dbt->size + dbt->dlen;
-
- /* Allocate the space. */
+ /* We use the record data return memory, it's only a short-term use. */
if (t->bt_rdata.ulen < nbytes) {
t->bt_rdata.data = t->bt_rdata.data == NULL ?
- (void *)malloc(nbytes) :
- (void *)realloc(t->bt_rdata.data, nbytes);
+ (void *)__db_malloc(nbytes) :
+ (void *)__db_realloc(t->bt_rdata.data, nbytes);
if (t->bt_rdata.data == NULL) {
t->bt_rdata.ulen = 0;
return (ENOMEM);
@@ -848,6 +969,16 @@ __bam_partial(dbp, dbt, h, indx)
t->bt_rdata.ulen = nbytes;
}
+ /* Find the current record. */
+ if (indx < NUM_ENT(h)) {
+ bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ bo = (BOVERFLOW *)bk;
+ } else {
+ bk = &tbk;
+ B_TSET(bk->type, B_KEYDATA, 0);
+ bk->len = 0;
+ }
+
/* We use nul bytes for extending the record, get it over with. */
memset(t->bt_rdata.data, 0, nbytes);