diff options
Diffstat (limited to 'db2/btree')
-rw-r--r-- | db2/btree/bt_cursor.c | 62 | ||||
-rw-r--r-- | db2/btree/bt_delete.c | 15 | ||||
-rw-r--r-- | db2/btree/bt_put.c | 131 | ||||
-rw-r--r-- | db2/btree/bt_search.c | 14 | ||||
-rw-r--r-- | db2/btree/bt_split.c | 9 | ||||
-rw-r--r-- | db2/btree/btree_auto.c | 8 |
6 files changed, 175 insertions, 64 deletions
diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c index e5f3fae..47ecd7c 100644 --- a/db2/btree/bt_cursor.c +++ b/db2/btree/bt_cursor.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_cursor.c 10.35 (Sleepycat) 10/25/97"; +static const char sccsid[] = "@(#)bt_cursor.c 10.37 (Sleepycat) 11/22/97"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -33,7 +33,7 @@ static int __bam_c_next __P((DB *, CURSOR *, int)); static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *)); static int __bam_c_prev __P((DB *, CURSOR *)); static int __bam_c_put __P((DBC *, DBT *, DBT *, int)); -static int __bam_c_rget __P((DB *, CURSOR *, DBT *, DBT *, int)); +static int __bam_c_rget __P((DB *, CURSOR *, DBT *, int)); static int __bam_c_search __P((DB *, CURSOR *, const DBT *, u_int, int, int *)); /* Discard the current page/lock held by a cursor. */ @@ -229,7 +229,7 @@ __bam_c_del(dbc, flags) B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type); else B_DSET(GET_BKEYDATA(h, indx)->type); - (void)__bam_ca_delete(dbp, pgno, indx, NULL); + (void)__bam_ca_delete(dbp, pgno, indx, NULL, 0); ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); @@ -313,7 +313,7 @@ __bam_c_get(dbc, key, data, flags) * been rammed into the interface. */ if (LF_ISSET(DB_GET_RECNO)) { - ret = __bam_c_rget(dbp, cp, key, data, flags); + ret = __bam_c_rget(dbp, cp, data, flags); PUTHANDLE(dbp); return (ret); } @@ -441,10 +441,10 @@ err: if (cp->page != NULL) * Return the record number for a cursor. */ static int -__bam_c_rget(dbp, cp, key, data, flags) +__bam_c_rget(dbp, cp, data, flags) DB *dbp; CURSOR *cp; - DBT *key, *data; + DBT *data; int flags; { BTREE *t; @@ -1113,18 +1113,18 @@ __bam_cprint(dbp) /* * __bam_ca_delete -- - * Check if any of the cursors refer to the item we are about to delete. - * We'll return the number of cursors that refer to the item in question. - * If a cursor does refer to the item, then we set its deleted bit. + * Check if any of the cursors refer to the item we are about to delete, + * returning the number of cursors that refer to the item in question. * - * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *)); + * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int)); */ int -__bam_ca_delete(dbp, pgno, indx, curs) +__bam_ca_delete(dbp, pgno, indx, curs, key_delete) DB *dbp; db_pgno_t pgno; u_int32_t indx; CURSOR *curs; + int key_delete; { DBC *dbc; CURSOR *cp; @@ -1140,22 +1140,40 @@ __bam_ca_delete(dbp, pgno, indx, curs) * It's possible for multiple cursors within the thread to have write * locks on the same page, but, cursors within a thread must be single * threaded, so all we're locking here is the cursor linked list. - * - * indx refers to the first of what might be a duplicate set. The - * cursor passed in is the one initiating the delete, so we don't - * want to count it. */ DB_THREAD_LOCK(dbp); + for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue); dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { cp = (CURSOR *)dbc->internal; - if ((curs != cp && - cp->pgno == pgno && cp->indx == indx) || - (cp->dpgno == pgno && cp->dindx == indx)) { - ++count; - F_SET(cp, C_DELETED); - } + + /* + * Optionally, a cursor passed in is the one initiating the + * delete, so we don't want to count it or set its deleted + * flag. Otherwise, if a cursor refers to the item, then we + * set its deleted flag. + */ + if (curs == cp) + continue; + + /* + * If we're deleting the key itself and not just one of its + * duplicates, repoint the cursor to the main-page key/data + * pair, everything else is about to be discarded. + */ + if (key_delete || cp->dpgno == PGNO_INVALID) { + if (cp->pgno == pgno && cp->indx == indx) { + cp->dpgno = PGNO_INVALID; + ++count; + F_SET(cp, C_DELETED); + } + } else + if (cp->dpgno == pgno && cp->dindx == indx) { + ++count; + F_SET(cp, C_DELETED); + } } + DB_THREAD_UNLOCK(dbp); return (count); } @@ -1440,7 +1458,7 @@ __bam_c_physdel(dbp, cp, h) * If the item is referenced by another cursor, leave it up to that * cursor to do the delete. */ - if (__bam_ca_delete(dbp, pgno, indx, cp) != 0) + if (__bam_ca_delete(dbp, pgno, indx, cp, 0) != 0) return (0); /* diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c index 9593d01..dbd1995 100644 --- a/db2/btree/bt_delete.c +++ b/db2/btree/bt_delete.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_delete.c 10.22 (Sleepycat) 11/2/97"; +static const char sccsid[] = "@(#)bt_delete.c 10.23 (Sleepycat) 11/22/97"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -101,17 +101,20 @@ __bam_delete(argdbp, txn, key, flags) h = t->bt_csp->page; indx = t->bt_csp->indx; - /* Delete the key/data pair, including any duplicates. */ + /* Delete the key/data pair, including any on-or-off page duplicates. */ for (cnt = 1, i = indx;; ++cnt) if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx]) break; for (; cnt > 0; --cnt, ++t->lstat.bt_deleted) - if (__bam_ca_delete(dbp, h->pgno, indx, NULL) != 0) { + if (__bam_ca_delete(dbp, h->pgno, indx, NULL, 1) == 0) { + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + } else { B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type); indx += P_INDX; - } else if ((ret = __bam_ditem(dbp, h, indx)) != 0 || - (ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; + } /* If we're using record numbers, update internal page record counts. */ if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0) diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c index b3d775b..3161b02 100644 --- a/db2/btree/bt_put.c +++ b/db2/btree/bt_put.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_put.c 10.31 (Sleepycat) 10/26/97"; +static const char sccsid[] = "@(#)bt_put.c 10.35 (Sleepycat) 11/22/97"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -64,6 +64,7 @@ static const char sccsid[] = "@(#)bt_put.c 10.31 (Sleepycat) 10/26/97"; #include "btree.h" static int __bam_fixed __P((BTREE *, DBT *)); +static int __bam_isdeleted __P((DB *, PAGE *, u_int32_t, int *)); static int __bam_lookup __P((DB *, DBT *, int *)); static int __bam_ndup __P((DB *, PAGE *, u_int32_t)); static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *)); @@ -89,7 +90,7 @@ __bam_put(argdbp, txn, key, data, flags) DB *dbp; PAGE *h; db_indx_t indx; - int exact, iflags, newkey, replace, ret, stack; + int exact, iflags, isdeleted, newkey, replace, ret, stack; DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags); @@ -114,21 +115,25 @@ retry: /* stack = 1; /* - * If an identical key is already in the tree, and DB_NOOVERWRITE is - * set, an error is returned. If an identical key is already in the - * tree and DB_NOOVERWRITE is not set, the key is either added (when - * duplicates are permitted) or an error is returned. The exception - * is when the item located is referenced by a cursor and marked for - * deletion, in which case we permit the overwrite and flag the cursor. + * If DB_NOOVERWRITE is set and there's an identical key in the tree, + * return an error unless the data item has already been marked for + * deletion, or, all the remaining data items have already been marked + * for deletion in the case of duplicates. If all the data items have + * been marked for deletion, we do a replace, otherwise, it has to be + * a set of duplicates, and we simply append a new one to the set. */ - replace = 0; - if (exact && flags == DB_NOOVERWRITE) { - if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) { - ret = DB_KEYEXIST; + isdeleted = replace = 0; + if (exact) { + if ((ret = __bam_isdeleted(dbp, h, indx, &isdeleted)) != 0) goto err; - } - replace = 1; - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); + if (isdeleted) { + replace = 1; + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); + } else + if (flags == DB_NOOVERWRITE) { + ret = DB_KEYEXIST; + goto err; + } } /* @@ -151,7 +156,7 @@ retry: /* */ newkey = dbp->type == DB_BTREE && !exact; if (exact) { - if (F_ISSET(dbp, DB_AM_DUP)) { + if (!isdeleted && F_ISSET(dbp, DB_AM_DUP)) { /* * Make sure that we're not looking at a page of * duplicates -- if so, move to the last entry on @@ -234,6 +239,88 @@ err: if (stack) } /* + * __bam_isdeleted -- + * Return if the only remaining data item for the element has been + * deleted. + */ +static int +__bam_isdeleted(dbp, h, indx, isdeletedp) + DB *dbp; + PAGE *h; + u_int32_t indx; + int *isdeletedp; +{ + BKEYDATA *bk; + db_pgno_t pgno; + int ret; + + *isdeletedp = 1; + for (;;) { + bk = GET_BKEYDATA(h, indx + O_INDX); + switch (B_TYPE(bk->type)) { + case B_KEYDATA: + case B_OVERFLOW: + if (!B_DISSET(bk->type)) { + *isdeletedp = 0; + return (0); + } + break; + case B_DUPLICATE: + /* + * If the data item referencing the off-page duplicates + * is flagged as deleted, we're done. Else, we have to + * walk the chain of duplicate pages. + */ + if (B_DISSET(bk->type)) + return (0); + goto dupchk; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + + /* + * If there are no more on-page duplicate items, then every + * data item for this key must have been deleted. + */ + if (indx + P_INDX >= (u_int32_t)NUM_ENT(h)) + return (0); + if (h->inp[indx] != h->inp[indx + P_INDX]) + return (0); + + /* Check the next item. */ + indx += P_INDX; + } + /* NOTREACHED */ + +dupchk: /* Check a chain of duplicate pages. */ + pgno = ((BOVERFLOW *)bk)->pgno; + for (;;) { + /* Acquire the next page in the duplicate chain. */ + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + return (ret); + + /* Check each item for a delete flag. */ + for (indx = 0; indx < NUM_ENT(h); ++indx) + if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) { + *isdeletedp = 0; + goto done; + } + /* + * If we reach the end of the duplicate pages, then every + * item we reviewed must have been deleted. + */ + if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID) + goto done; + + (void)memp_fput(dbp->mpf, h, 0); + } + /* NOTREACHED */ + +done: (void)memp_fput(dbp->mpf, h, 0); + return (0); +} + +/* * __bam_lookup -- * Find the right location in the tree for the key. */ @@ -425,10 +512,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) if (op == DB_CURRENT) { bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); - if (B_TYPE(bk->type) == B_OVERFLOW) - have_bytes = BOVERFLOW_PSIZE; - else + if (B_TYPE(bk->type) == B_KEYDATA) have_bytes = BKEYDATA_PSIZE(bk->len); + else + have_bytes = BOVERFLOW_PSIZE; need_bytes = 0; } else { have_bytes = 0; @@ -542,7 +629,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * If we're dealing with offpage items, we have to * delete and then re-add the item. */ - if (bigdata || B_TYPE(bk->type) == B_OVERFLOW) { + if (bigdata || B_TYPE(bk->type) != B_KEYDATA) { if ((ret = __bam_ditem(dbp, h, indx)) != 0) return (ret); break; @@ -704,9 +791,9 @@ __bam_ritem(dbp, h, indx, data) { BKEYDATA *bk; DBT orig, repl; - db_indx_t lo, ln, min, off, prefix, suffix; + db_indx_t cnt, lo, ln, min, off, prefix, suffix; int32_t nbytes; - int cnt, ret; + int ret; u_int8_t *p, *t; /* diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c index a21a820..c39c9af 100644 --- a/db2/btree/bt_search.c +++ b/db2/btree/bt_search.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_search.c 10.8 (Sleepycat) 10/25/97"; +static const char sccsid[] = "@(#)bt_search.c 10.9 (Sleepycat) 11/18/97"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -119,12 +119,20 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) return (ret); } - /* Decide if we need to save this page; if we do, write lock it. */ + /* + * Decide if we need to save this page; if we do, write lock it. + * We deliberately don't lock-couple on this call. If the tree + * is tiny, i.e., one page, and two threads are busily updating + * the root page, we're almost guaranteed deadlocks galore, as + * each one gets a read lock and then blocks the other's attempt + * for a write lock. + */ if (!stack && ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { (void)memp_fput(dbp->mpf, h, 0); - if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0) + (void)__BT_LPUT(dbp, lock); + if ((ret = __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0) return (ret); if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { (void)__BT_LPUT(dbp, lock); diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c index bc09131..219d486 100644 --- a/db2/btree/bt_split.c +++ b/db2/btree/bt_split.c @@ -44,7 +44,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_split.c 10.17 (Sleepycat) 11/2/97"; +static const char sccsid[] = "@(#)bt_split.c 10.18 (Sleepycat) 11/23/97"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -396,14 +396,14 @@ __bam_broot(dbp, rootp, lp, rp) * The btree comparison code guarantees that the left-most key on any * level of the tree is never used, so it doesn't need to be filled in. */ + memset(&bi, 0, sizeof(bi)); bi.len = 0; B_TSET(bi.type, B_KEYDATA, 0); bi.pgno = lp->pgno; if (F_ISSET(dbp, DB_BT_RECNUM)) { bi.nrecs = __bam_total(lp); RE_NREC_SET(rootp, bi.nrecs); - } else - bi.nrecs = 0; + } hdr.data = &bi; hdr.size = SSZA(BINTERNAL, data); if ((ret = @@ -591,6 +591,7 @@ __bam_pinsert(dbp, parent, lchild, rchild) return (DB_NEEDSPLIT); /* Add a new record for the right page. */ + memset(&bi, 0, sizeof(bi)); bi.len = child_bi->len; B_TSET(bi.type, child_bi->type, 0); bi.pgno = rchild->pgno; @@ -640,6 +641,7 @@ noprefix: nksize = child_bk->len; if (P_FREESPACE(ppage) < nbytes) return (DB_NEEDSPLIT); + memset(&bi, 0, sizeof(bi)); bi.len = nksize; B_TSET(bi.type, child_bk->type, 0); bi.pgno = rchild->pgno; @@ -661,6 +663,7 @@ noprefix: nksize = child_bk->len; if (P_FREESPACE(ppage) < nbytes) return (DB_NEEDSPLIT); + memset(&bi, 0, sizeof(bi)); bi.len = BOVERFLOW_SIZE; B_TSET(bi.type, child_bk->type, 0); bi.pgno = rchild->pgno; diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c index 45232bb..18b9b34 100644 --- a/db2/btree/btree_auto.c +++ b/db2/btree/btree_auto.c @@ -100,7 +100,6 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_pg_alloc_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_pg_alloc_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; @@ -265,7 +264,6 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_pg_free_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_pg_free_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; @@ -460,7 +458,6 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_split_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_split_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; @@ -657,7 +654,6 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_rsplit_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_rsplit_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; @@ -836,7 +832,6 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_adj_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_adj_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; @@ -995,7 +990,6 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_cadjust_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_cadjust_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; @@ -1145,7 +1139,6 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_cdel_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_cdel_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; @@ -1329,7 +1322,6 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags, * PUBLIC: int __bam_repl_print * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); */ - int __bam_repl_print(notused1, dbtp, lsnp, notused3, notused4) DB_LOG *notused1; |