diff options
Diffstat (limited to 'db2/btree/bt_put.c')
-rw-r--r-- | db2/btree/bt_put.c | 461 |
1 files changed, 296 insertions, 165 deletions
diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c index af09f76..b3d775b 100644 --- a/db2/btree/bt_put.c +++ b/db2/btree/bt_put.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_put.c 10.25 (Sleepycat) 9/17/97"; +static const char sccsid[] = "@(#)bt_put.c 10.31 (Sleepycat) 10/26/97"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -66,7 +66,10 @@ static const char sccsid[] = "@(#)bt_put.c 10.25 (Sleepycat) 9/17/97"; static int __bam_fixed __P((BTREE *, DBT *)); static int __bam_lookup __P((DB *, DBT *, int *)); static int __bam_ndup __P((DB *, PAGE *, u_int32_t)); -static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t)); +static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *)); +static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t, u_int32_t)); +static u_int32_t + __bam_partsize __P((DB *, DBT *, PAGE *, u_int32_t)); /* * __bam_put -- @@ -334,21 +337,6 @@ slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp)); } /* - * OVPUT -- - * Copy an overflow item onto a page. - */ -#undef OVPUT -#define OVPUT(h, indx, bo) do { \ - DBT __hdr; \ - memset(&__hdr, 0, sizeof(__hdr)); \ - __hdr.data = &bo; \ - __hdr.size = BOVERFLOW_SIZE; \ - if ((ret = __db_pitem(dbp, \ - h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \ - return (ret); \ -} while (0) - -/* * __bam_iitem -- * Insert an item into the tree. * @@ -365,19 +353,18 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) { BTREE *t; BKEYDATA *bk; - BOVERFLOW kbo, dbo; DBT tdbt; PAGE *h; db_indx_t indx; - u_int32_t have_bytes, need_bytes, needed; - int bigkey, bigdata, dcopy, dupadjust, ret; + u_int32_t data_size, have_bytes, need_bytes, needed; + int bigkey, bigdata, dupadjust, replace, ret; t = dbp->internal; h = *hp; indx = *indxp; - dupadjust = 0; bk = NULL; /* XXX: Shut the compiler up. */ + dupadjust = replace = 0; /* * If it's a page of duplicates, call the common code to do the work. @@ -385,7 +372,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * !!! * Here's where the hp and indxp are important. The duplicate code * may decide to rework/rearrange the pages and indices we're using, - * so the caller must understand that the stack has to change. + * so the caller must understand that the page stack may change. */ if (TYPE(h) == P_DUPLICATE) { /* Adjust the index for the new item if it's a DB_AFTER op. */ @@ -401,24 +388,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) return (__db_dput(dbp, data, hp, indxp, __bam_new)); } - /* - * XXX - * Handle partial puts. - * - * This is truly awful from a performance standput. We don't optimize - * for partial puts at all, we delete the record and add it back in, - * regardless of size or if we're simply overwriting current data. - * The hash access method does this a lot better than we do, and we're - * eventually going to have to fix it. - */ - if (F_ISSET(data, DB_DBT_PARTIAL)) { - tdbt = *data; - if ((ret = __bam_partial(dbp, &tdbt, h, indx)) != 0) - return (ret); - data = &tdbt; - } - - /* If it's a short fixed-length record, fix it up. */ + /* Handle fixed-length records: build the real record. */ if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) { tdbt = *data; if ((ret = __bam_fixed(t, &tdbt)) != 0) @@ -427,30 +397,15 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) } /* - * If the key or data item won't fit on a page, store it in the - * overflow pages. - * - * !!! - * From this point on, we have to recover the allocated overflow - * pages on error. + * Figure out how much space the data will take, including if it's a + * partial record. If either of the key or data items won't fit on + * a page, we'll have to store them on overflow pages. */ - bigkey = bigdata = 0; - if (LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize) { - B_TSET(kbo.type, B_OVERFLOW, 0); - kbo.tlen = key->size; - if ((ret = __db_poff(dbp, key, &kbo.pgno, __bam_new)) != 0) - goto err; - bigkey = 1; - } - if (data->size > t->bt_ovflsize) { - B_TSET(dbo.type, B_OVERFLOW, 0); - dbo.tlen = data->size; - if ((ret = __db_poff(dbp, data, &dbo.pgno, __bam_new)) != 0) - goto err; - bigdata = 1; - } + bigkey = LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize; + data_size = F_ISSET(data, DB_DBT_PARTIAL) ? + __bam_partsize(dbp, data, h, indx) : data->size; + bigdata = data_size > t->bt_ovflsize; - dcopy = 0; needed = 0; if (LF_ISSET(BI_NEWKEY)) { /* If BI_NEWKEY is set we're adding a new key and data pair. */ @@ -461,7 +416,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) if (bigdata) needed += BOVERFLOW_PSIZE; else - needed += BKEYDATA_PSIZE(data->size); + needed += BKEYDATA_PSIZE(data_size); } else { /* * We're either overwriting the data item of a key/data pair @@ -482,16 +437,8 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) if (bigdata) need_bytes += BOVERFLOW_PSIZE; else - need_bytes += BKEYDATA_PSIZE(data->size); + need_bytes += BKEYDATA_PSIZE(data_size); - /* - * If we're overwriting a data item, we copy it if it's not a - * special record type and it's the same size (including any - * alignment) and do a delete/insert otherwise. - */ - if (op == DB_CURRENT && !bigdata && - B_TYPE(bk->type) == B_KEYDATA && have_bytes == need_bytes) - dcopy = 1; if (have_bytes < need_bytes) needed += need_bytes - have_bytes; } @@ -505,9 +452,15 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * check in the btree split code, so we don't undo it there!?!? */ if (P_FREESPACE(h) < needed || - (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) { - ret = DB_NEEDSPLIT; - goto err; + (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) + return (DB_NEEDSPLIT); + + /* Handle partial puts: build the real record. */ + if (F_ISSET(data, DB_DBT_PARTIAL)) { + tdbt = *data; + if ((ret = __bam_partial(dbp, &tdbt, h, indx, data_size)) != 0) + return (ret); + data = &tdbt; } /* @@ -515,10 +468,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * * 1. Append a new key/data pair. * 2. Insert a new key/data pair. - * 3. Copy the data item. - * 4. Delete/insert the data item. - * 5. Append a new data item. - * 6. Insert a new data item. + * 3. Append a new data item (a new duplicate). + * 4. Insert a new data item (a new duplicate). + * 5. Overflow item: delete and re-add the data item. + * 6. Replace the data item. */ if (LF_ISSET(BI_NEWKEY)) { switch (op) { @@ -533,42 +486,17 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) } /* Add the key. */ - if (bigkey) - OVPUT(h, indx, kbo); - else { - DBT __data; - memset(&__data, 0, sizeof(__data)); - __data.data = key->data; - __data.size = key->size; + if (bigkey) { + if ((ret = __bam_ovput(dbp, h, indx, key)) != 0) + return (ret); + } else if ((ret = __db_pitem(dbp, h, indx, - BKEYDATA_SIZE(key->size), NULL, &__data)) != 0) - goto err; - } + BKEYDATA_SIZE(key->size), NULL, key)) != 0) + return (ret); ++indx; } else { switch (op) { - case DB_CURRENT: /* 3. Copy the data item. */ - /* - * If we're not logging and it's possible, overwrite - * the current item. - * - * XXX - * We should add a separate logging message so that - * we can do this anytime it's possible, including - * for partial record puts. - */ - if (dcopy && !DB_LOGGING(dbp)) { - bk->len = data->size; - memcpy(bk->data, data->data, data->size); - goto done; - } - /* 4. Delete/insert the data item. */ - if (TYPE(h) == P_LBTREE) - ++indx; - if ((ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; - break; - case DB_AFTER: /* 5. Append a new data item. */ + case DB_AFTER: /* 3. Append a new data item. */ if (TYPE(h) == P_LBTREE) { /* * Adjust the cursor and copy in the key for @@ -576,7 +504,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) */ if ((ret = __bam_adjindx(dbp, h, indx + P_INDX, indx, 1)) != 0) - goto err; + return (ret); indx += 3; dupadjust = 1; @@ -589,7 +517,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) *indxp += 1; } break; - case DB_BEFORE: /* 6. Insert a new data item. */ + case DB_BEFORE: /* 4. Insert a new data item. */ if (TYPE(h) == P_LBTREE) { /* * Adjust the cursor and copy in the key for @@ -597,43 +525,62 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) */ if ((ret = __bam_adjindx(dbp, h, indx, indx, 1)) != 0) - goto err; + return (ret); ++indx; dupadjust = 1; } else __bam_ca_di(dbp, h->pgno, indx, 1); break; + case DB_CURRENT: + if (TYPE(h) == P_LBTREE) + ++indx; + + /* + * 5. Delete/re-add the data item. + * + * If we're dealing with offpage items, we have to + * delete and then re-add the item. + */ + if (bigdata || B_TYPE(bk->type) == B_OVERFLOW) { + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + return (ret); + break; + } + + /* 6. Replace the data item. */ + replace = 1; + break; default: abort(); } } /* Add the data. */ - if (bigdata) - OVPUT(h, indx, dbo); - else { + if (bigdata) { + if ((ret = __bam_ovput(dbp, h, indx, data)) != 0) + return (ret); + } else { BKEYDATA __bk; - DBT __hdr, __data; - memset(&__data, 0, sizeof(__data)); - __data.data = data->data; - __data.size = data->size; + DBT __hdr; if (LF_ISSET(BI_DELETED)) { B_TSET(__bk.type, B_KEYDATA, 1); - __bk.len = __data.size; + __bk.len = data->size; __hdr.data = &__bk; __hdr.size = SSZA(BKEYDATA, data); ret = __db_pitem(dbp, h, indx, - BKEYDATA_SIZE(__data.size), &__hdr, &__data); - } else + BKEYDATA_SIZE(data->size), &__hdr, data); + } else if (replace) + ret = __bam_ritem(dbp, h, indx, data); + else ret = __db_pitem(dbp, h, indx, - BKEYDATA_SIZE(data->size), NULL, &__data); + BKEYDATA_SIZE(data->size), NULL, data); if (ret != 0) - goto err; + return (ret); } -done: ++t->lstat.bt_added; + ++t->lstat.bt_added; ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY); @@ -645,22 +592,206 @@ done: ++t->lstat.bt_added; if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) { --indx; if ((ret = __bam_ndup(dbp, h, indx)) != 0) - goto err; + return (ret); } if (t->bt_recno != NULL) F_SET(t->bt_recno, RECNO_MODIFIED); - if (0) { -err: if (bigkey) - (void)__db_doff(dbp, kbo.pgno, __bam_free); - if (bigdata) - (void)__db_doff(dbp, dbo.pgno, __bam_free); - } return (ret); } /* + * __bam_partsize -- + * Figure out how much space a partial data item is in total. + */ +static u_int32_t +__bam_partsize(dbp, data, h, indx) + DB *dbp; + DBT *data; + PAGE *h; + u_int32_t indx; +{ + BKEYDATA *bk; + u_int32_t nbytes; + + /* + * Figure out how much total space we'll need. If the record doesn't + * already exist, it's simply the data we're provided. + */ + if (indx >= NUM_ENT(h)) + return (data->doff + data->size); + + /* + * Otherwise, it's the data provided plus any already existing data + * that we're not replacing. + */ + bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + nbytes = + B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len; + + /* + * There are really two cases here: + * + * Case 1: We are replacing some bytes that do not exist (i.e., they + * are past the end of the record). In this case the number of bytes + * we are replacing is irrelevant and all we care about is how many + * bytes we are going to add from offset. So, the new record length + * is going to be the size of the new bytes (size) plus wherever those + * new bytes begin (doff). + * + * Case 2: All the bytes we are replacing exist. Therefore, the new + * size is the oldsize (nbytes) minus the bytes we are replacing (dlen) + * plus the bytes we are adding (size). + */ + if (nbytes < data->doff + data->dlen) /* Case 1 */ + return (data->doff + data->size); + + return (nbytes + data->size - data->dlen); /* Case 2 */ +} + +/* + * OVPUT -- + * Copy an overflow item onto a page. + */ +#undef OVPUT +#define OVPUT(h, indx, bo) do { \ + DBT __hdr; \ + memset(&__hdr, 0, sizeof(__hdr)); \ + __hdr.data = &bo; \ + __hdr.size = BOVERFLOW_SIZE; \ + if ((ret = __db_pitem(dbp, \ + h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \ + return (ret); \ +} while (0) + +/* + * __bam_ovput -- + * Build an overflow item and put it on the page. + */ +static int +__bam_ovput(dbp, h, indx, item) + DB *dbp; + PAGE *h; + u_int32_t indx; + DBT *item; +{ + BOVERFLOW bo; + int ret; + + B_TSET(bo.type, B_OVERFLOW, 0); + bo.tlen = item->size; + if ((ret = __db_poff(dbp, item, &bo.pgno, __bam_new)) != 0) + return (ret); + + OVPUT(h, indx, bo); + + return (0); +} + +/* + * __bam_ritem -- + * Replace an item on a page. + * + * PUBLIC: int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *)); + */ +int +__bam_ritem(dbp, h, indx, data) + DB *dbp; + PAGE *h; + u_int32_t indx; + DBT *data; +{ + BKEYDATA *bk; + DBT orig, repl; + db_indx_t lo, ln, min, off, prefix, suffix; + int32_t nbytes; + int cnt, ret; + u_int8_t *p, *t; + + /* + * Replace a single item onto a page. The logic figuring out where + * to insert and whether it fits is handled in the caller. All we do + * here is manage the page shuffling. + */ + bk = GET_BKEYDATA(h, indx); + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + /* + * We might as well check to see if the two data items share + * a common prefix and suffix -- it can save us a lot of log + * message if they're large. + */ + min = data->size < bk->len ? data->size : bk->len; + for (prefix = 0, + p = bk->data, t = data->data; + prefix < min && *p == *t; ++prefix, ++p, ++t) + ; + + min -= prefix; + for (suffix = 0, + p = (u_int8_t *)bk->data + bk->len - 1, + t = (u_int8_t *)data->data + data->size - 1; + suffix < min && *p == *t; ++suffix, --p, --t) + ; + + /* We only log the parts of the keys that have changed. */ + orig.data = (u_int8_t *)bk->data + prefix; + orig.size = bk->len - (prefix + suffix); + repl.data = (u_int8_t *)data->data + prefix; + repl.size = data->size - (prefix + suffix); + if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), + (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type), + &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0) + return (ret); + } + + /* + * Set references to the first in-use byte on the page and the + * first byte of the item being replaced. + */ + p = (u_int8_t *)h + HOFFSET(h); + t = (u_int8_t *)bk; + + /* + * If the entry is growing in size, shift the beginning of the data + * part of the page down. If the entry is shrinking in size, shift + * the beginning of the data part of the page up. Use memmove(3), + * the regions overlap. + */ + lo = BKEYDATA_SIZE(bk->len); + ln = BKEYDATA_SIZE(data->size); + if (lo != ln) { + nbytes = lo - ln; /* Signed difference. */ + if (p == t) /* First index is fast. */ + h->inp[indx] += nbytes; + else { /* Else, shift the page. */ + memmove(p + nbytes, p, t - p); + + /* Adjust the indices' offsets. */ + off = h->inp[indx]; + for (cnt = 0; cnt < NUM_ENT(h); ++cnt) + if (h->inp[cnt] <= off) + h->inp[cnt] += nbytes; + } + + /* Clean up the page and adjust the item's reference. */ + HOFFSET(h) += nbytes; + t += nbytes; + } + + /* Copy the new item onto the page. */ + bk = (BKEYDATA *)t; + B_TSET(bk->type, B_KEYDATA, 0); + bk->len = data->size; + memcpy(bk->data, data->data, data->size); + + return (0); +} + +/* * __bam_ndup -- * Check to see if the duplicate set at indx should have its own page. * If it should, create it. @@ -766,16 +897,21 @@ __bam_fixed(t, dbt) rp = t->bt_recno; /* - * If using fixed-length records, and the record is long, return - * EINVAL. If it's short, pad it out. Use the record data return - * memory, it's only short-term. + * If database contains fixed-length records, and the record is long, + * return EINVAL. */ if (dbt->size > rp->re_len) return (EINVAL); + + /* + * The caller checked to see if it was just right, so we know it's + * short. Pad it out. We use the record data return memory, it's + * only a short-term use. + */ if (t->bt_rdata.ulen < rp->re_len) { t->bt_rdata.data = t->bt_rdata.data == NULL ? - (void *)malloc(rp->re_len) : - (void *)realloc(t->bt_rdata.data, rp->re_len); + (void *)__db_malloc(rp->re_len) : + (void *)__db_realloc(t->bt_rdata.data, rp->re_len); if (t->bt_rdata.data == NULL) { t->bt_rdata.ulen = 0; return (ENOMEM); @@ -786,12 +922,16 @@ __bam_fixed(t, dbt) memset((u_int8_t *)t->bt_rdata.data + dbt->size, rp->re_pad, rp->re_len - dbt->size); - /* Set the DBT to reference our new record. */ + /* + * Clean up our flags and other information just in case, and + * change the caller's DBT to reference our created record. + */ t->bt_rdata.size = rp->re_len; t->bt_rdata.dlen = 0; t->bt_rdata.doff = 0; t->bt_rdata.flags = 0; *dbt = t->bt_rdata; + return (0); } @@ -800,47 +940,28 @@ __bam_fixed(t, dbt) * Build the real record for a partial put. */ static int -__bam_partial(dbp, dbt, h, indx) +__bam_partial(dbp, dbt, h, indx, nbytes) DB *dbp; DBT *dbt; PAGE *h; - u_int32_t indx; + u_int32_t indx, nbytes; { BTREE *t; BKEYDATA *bk, tbk; BOVERFLOW *bo; DBT copy; - u_int32_t len, nbytes, tlen; + u_int32_t len, tlen; int ret; u_int8_t *p; bo = NULL; /* XXX: Shut the compiler up. */ t = dbp->internal; - /* - * Figure out how much total space we'll need. Worst case is where - * the record is 0 bytes long, in which case doff causes the record - * to extend, and the put data is appended to it. - */ - if (indx < NUM_ENT(h)) { - bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); - if (B_TYPE(bk->type) == B_OVERFLOW) { - bo = (BOVERFLOW *)bk; - nbytes = bo->tlen; - } else - nbytes = bk->len; - } else { - bk = &tbk; - B_TSET(bk->type, B_KEYDATA, 0); - nbytes = bk->len = 0; - } - nbytes += dbt->doff + dbt->size + dbt->dlen; - - /* Allocate the space. */ + /* We use the record data return memory, it's only a short-term use. */ if (t->bt_rdata.ulen < nbytes) { t->bt_rdata.data = t->bt_rdata.data == NULL ? - (void *)malloc(nbytes) : - (void *)realloc(t->bt_rdata.data, nbytes); + (void *)__db_malloc(nbytes) : + (void *)__db_realloc(t->bt_rdata.data, nbytes); if (t->bt_rdata.data == NULL) { t->bt_rdata.ulen = 0; return (ENOMEM); @@ -848,6 +969,16 @@ __bam_partial(dbp, dbt, h, indx) t->bt_rdata.ulen = nbytes; } + /* Find the current record. */ + if (indx < NUM_ENT(h)) { + bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + bo = (BOVERFLOW *)bk; + } else { + bk = &tbk; + B_TSET(bk->type, B_KEYDATA, 0); + bk->len = 0; + } + /* We use nul bytes for extending the record, get it over with. */ memset(t->bt_rdata.data, 0, nbytes); |