aboutsummaryrefslogtreecommitdiff
path: root/db2/btree
diff options
context:
space:
mode:
Diffstat (limited to 'db2/btree')
-rw-r--r--db2/btree/bt_cursor.c62
-rw-r--r--db2/btree/bt_delete.c15
-rw-r--r--db2/btree/bt_put.c131
-rw-r--r--db2/btree/bt_search.c14
-rw-r--r--db2/btree/bt_split.c9
-rw-r--r--db2/btree/btree_auto.c8
6 files changed, 175 insertions, 64 deletions
diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c
index e5f3fae..47ecd7c 100644
--- a/db2/btree/bt_cursor.c
+++ b/db2/btree/bt_cursor.c
@@ -8,7 +8,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)bt_cursor.c 10.35 (Sleepycat) 10/25/97";
+static const char sccsid[] = "@(#)bt_cursor.c 10.37 (Sleepycat) 11/22/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -33,7 +33,7 @@ static int __bam_c_next __P((DB *, CURSOR *, int));
static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *));
static int __bam_c_prev __P((DB *, CURSOR *));
static int __bam_c_put __P((DBC *, DBT *, DBT *, int));
-static int __bam_c_rget __P((DB *, CURSOR *, DBT *, DBT *, int));
+static int __bam_c_rget __P((DB *, CURSOR *, DBT *, int));
static int __bam_c_search __P((DB *, CURSOR *, const DBT *, u_int, int, int *));
/* Discard the current page/lock held by a cursor. */
@@ -229,7 +229,7 @@ __bam_c_del(dbc, flags)
B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
else
B_DSET(GET_BKEYDATA(h, indx)->type);
- (void)__bam_ca_delete(dbp, pgno, indx, NULL);
+ (void)__bam_ca_delete(dbp, pgno, indx, NULL, 0);
ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
@@ -313,7 +313,7 @@ __bam_c_get(dbc, key, data, flags)
* been rammed into the interface.
*/
if (LF_ISSET(DB_GET_RECNO)) {
- ret = __bam_c_rget(dbp, cp, key, data, flags);
+ ret = __bam_c_rget(dbp, cp, data, flags);
PUTHANDLE(dbp);
return (ret);
}
@@ -441,10 +441,10 @@ err: if (cp->page != NULL)
* Return the record number for a cursor.
*/
static int
-__bam_c_rget(dbp, cp, key, data, flags)
+__bam_c_rget(dbp, cp, data, flags)
DB *dbp;
CURSOR *cp;
- DBT *key, *data;
+ DBT *data;
int flags;
{
BTREE *t;
@@ -1113,18 +1113,18 @@ __bam_cprint(dbp)
/*
* __bam_ca_delete --
- * Check if any of the cursors refer to the item we are about to delete.
- * We'll return the number of cursors that refer to the item in question.
- * If a cursor does refer to the item, then we set its deleted bit.
+ * Check if any of the cursors refer to the item we are about to delete,
+ * returning the number of cursors that refer to the item in question.
*
- * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *));
+ * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int));
*/
int
-__bam_ca_delete(dbp, pgno, indx, curs)
+__bam_ca_delete(dbp, pgno, indx, curs, key_delete)
DB *dbp;
db_pgno_t pgno;
u_int32_t indx;
CURSOR *curs;
+ int key_delete;
{
DBC *dbc;
CURSOR *cp;
@@ -1140,22 +1140,40 @@ __bam_ca_delete(dbp, pgno, indx, curs)
* It's possible for multiple cursors within the thread to have write
* locks on the same page, but, cursors within a thread must be single
* threaded, so all we're locking here is the cursor linked list.
- *
- * indx refers to the first of what might be a duplicate set. The
- * cursor passed in is the one initiating the delete, so we don't
- * want to count it.
*/
DB_THREAD_LOCK(dbp);
+
for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue);
dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
cp = (CURSOR *)dbc->internal;
- if ((curs != cp &&
- cp->pgno == pgno && cp->indx == indx) ||
- (cp->dpgno == pgno && cp->dindx == indx)) {
- ++count;
- F_SET(cp, C_DELETED);
- }
+
+ /*
+ * Optionally, a cursor passed in is the one initiating the
+ * delete, so we don't want to count it or set its deleted
+ * flag. Otherwise, if a cursor refers to the item, then we
+ * set its deleted flag.
+ */
+ if (curs == cp)
+ continue;
+
+ /*
+ * If we're deleting the key itself and not just one of its
+ * duplicates, repoint the cursor to the main-page key/data
+ * pair, everything else is about to be discarded.
+ */
+ if (key_delete || cp->dpgno == PGNO_INVALID) {
+ if (cp->pgno == pgno && cp->indx == indx) {
+ cp->dpgno = PGNO_INVALID;
+ ++count;
+ F_SET(cp, C_DELETED);
+ }
+ } else
+ if (cp->dpgno == pgno && cp->dindx == indx) {
+ ++count;
+ F_SET(cp, C_DELETED);
+ }
}
+
DB_THREAD_UNLOCK(dbp);
return (count);
}
@@ -1440,7 +1458,7 @@ __bam_c_physdel(dbp, cp, h)
* If the item is referenced by another cursor, leave it up to that
* cursor to do the delete.
*/
- if (__bam_ca_delete(dbp, pgno, indx, cp) != 0)
+ if (__bam_ca_delete(dbp, pgno, indx, cp, 0) != 0)
return (0);
/*
diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c
index 9593d01..dbd1995 100644
--- a/db2/btree/bt_delete.c
+++ b/db2/btree/bt_delete.c
@@ -47,7 +47,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)bt_delete.c 10.22 (Sleepycat) 11/2/97";
+static const char sccsid[] = "@(#)bt_delete.c 10.23 (Sleepycat) 11/22/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -101,17 +101,20 @@ __bam_delete(argdbp, txn, key, flags)
h = t->bt_csp->page;
indx = t->bt_csp->indx;
- /* Delete the key/data pair, including any duplicates. */
+ /* Delete the key/data pair, including any on-or-off page duplicates. */
for (cnt = 1, i = indx;; ++cnt)
if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx])
break;
for (; cnt > 0; --cnt, ++t->lstat.bt_deleted)
- if (__bam_ca_delete(dbp, h->pgno, indx, NULL) != 0) {
+ if (__bam_ca_delete(dbp, h->pgno, indx, NULL, 1) == 0) {
+ if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+ goto err;
+ if ((ret = __bam_ditem(dbp, h, indx)) != 0)
+ goto err;
+ } else {
B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
indx += P_INDX;
- } else if ((ret = __bam_ditem(dbp, h, indx)) != 0 ||
- (ret = __bam_ditem(dbp, h, indx)) != 0)
- goto err;
+ }
/* If we're using record numbers, update internal page record counts. */
if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0)
diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c
index b3d775b..3161b02 100644
--- a/db2/btree/bt_put.c
+++ b/db2/btree/bt_put.c
@@ -47,7 +47,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)bt_put.c 10.31 (Sleepycat) 10/26/97";
+static const char sccsid[] = "@(#)bt_put.c 10.35 (Sleepycat) 11/22/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -64,6 +64,7 @@ static const char sccsid[] = "@(#)bt_put.c 10.31 (Sleepycat) 10/26/97";
#include "btree.h"
static int __bam_fixed __P((BTREE *, DBT *));
+static int __bam_isdeleted __P((DB *, PAGE *, u_int32_t, int *));
static int __bam_lookup __P((DB *, DBT *, int *));
static int __bam_ndup __P((DB *, PAGE *, u_int32_t));
static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *));
@@ -89,7 +90,7 @@ __bam_put(argdbp, txn, key, data, flags)
DB *dbp;
PAGE *h;
db_indx_t indx;
- int exact, iflags, newkey, replace, ret, stack;
+ int exact, iflags, isdeleted, newkey, replace, ret, stack;
DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags);
@@ -114,21 +115,25 @@ retry: /*
stack = 1;
/*
- * If an identical key is already in the tree, and DB_NOOVERWRITE is
- * set, an error is returned. If an identical key is already in the
- * tree and DB_NOOVERWRITE is not set, the key is either added (when
- * duplicates are permitted) or an error is returned. The exception
- * is when the item located is referenced by a cursor and marked for
- * deletion, in which case we permit the overwrite and flag the cursor.
+ * If DB_NOOVERWRITE is set and there's an identical key in the tree,
+ * return an error unless the data item has already been marked for
+ * deletion, or, all the remaining data items have already been marked
+ * for deletion in the case of duplicates. If all the data items have
+ * been marked for deletion, we do a replace, otherwise, it has to be
+ * a set of duplicates, and we simply append a new one to the set.
*/
- replace = 0;
- if (exact && flags == DB_NOOVERWRITE) {
- if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) {
- ret = DB_KEYEXIST;
+ isdeleted = replace = 0;
+ if (exact) {
+ if ((ret = __bam_isdeleted(dbp, h, indx, &isdeleted)) != 0)
goto err;
- }
- replace = 1;
- __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
+ if (isdeleted) {
+ replace = 1;
+ __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
+ } else
+ if (flags == DB_NOOVERWRITE) {
+ ret = DB_KEYEXIST;
+ goto err;
+ }
}
/*
@@ -151,7 +156,7 @@ retry: /*
*/
newkey = dbp->type == DB_BTREE && !exact;
if (exact) {
- if (F_ISSET(dbp, DB_AM_DUP)) {
+ if (!isdeleted && F_ISSET(dbp, DB_AM_DUP)) {
/*
* Make sure that we're not looking at a page of
* duplicates -- if so, move to the last entry on
@@ -234,6 +239,88 @@ err: if (stack)
}
/*
+ * __bam_isdeleted --
+ * Return if the only remaining data item for the element has been
+ * deleted.
+ */
+static int
+__bam_isdeleted(dbp, h, indx, isdeletedp)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t indx;
+ int *isdeletedp;
+{
+ BKEYDATA *bk;
+ db_pgno_t pgno;
+ int ret;
+
+ *isdeletedp = 1;
+ for (;;) {
+ bk = GET_BKEYDATA(h, indx + O_INDX);
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ case B_OVERFLOW:
+ if (!B_DISSET(bk->type)) {
+ *isdeletedp = 0;
+ return (0);
+ }
+ break;
+ case B_DUPLICATE:
+ /*
+ * If the data item referencing the off-page duplicates
+ * is flagged as deleted, we're done. Else, we have to
+ * walk the chain of duplicate pages.
+ */
+ if (B_DISSET(bk->type))
+ return (0);
+ goto dupchk;
+ default:
+ return (__db_pgfmt(dbp, h->pgno));
+ }
+
+ /*
+ * If there are no more on-page duplicate items, then every
+ * data item for this key must have been deleted.
+ */
+ if (indx + P_INDX >= (u_int32_t)NUM_ENT(h))
+ return (0);
+ if (h->inp[indx] != h->inp[indx + P_INDX])
+ return (0);
+
+ /* Check the next item. */
+ indx += P_INDX;
+ }
+ /* NOTREACHED */
+
+dupchk: /* Check a chain of duplicate pages. */
+ pgno = ((BOVERFLOW *)bk)->pgno;
+ for (;;) {
+ /* Acquire the next page in the duplicate chain. */
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ return (ret);
+
+ /* Check each item for a delete flag. */
+ for (indx = 0; indx < NUM_ENT(h); ++indx)
+ if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) {
+ *isdeletedp = 0;
+ goto done;
+ }
+ /*
+ * If we reach the end of the duplicate pages, then every
+ * item we reviewed must have been deleted.
+ */
+ if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID)
+ goto done;
+
+ (void)memp_fput(dbp->mpf, h, 0);
+ }
+ /* NOTREACHED */
+
+done: (void)memp_fput(dbp->mpf, h, 0);
+ return (0);
+}
+
+/*
* __bam_lookup --
* Find the right location in the tree for the key.
*/
@@ -425,10 +512,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
if (op == DB_CURRENT) {
bk = GET_BKEYDATA(h,
indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
- if (B_TYPE(bk->type) == B_OVERFLOW)
- have_bytes = BOVERFLOW_PSIZE;
- else
+ if (B_TYPE(bk->type) == B_KEYDATA)
have_bytes = BKEYDATA_PSIZE(bk->len);
+ else
+ have_bytes = BOVERFLOW_PSIZE;
need_bytes = 0;
} else {
have_bytes = 0;
@@ -542,7 +629,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags)
* If we're dealing with offpage items, we have to
* delete and then re-add the item.
*/
- if (bigdata || B_TYPE(bk->type) == B_OVERFLOW) {
+ if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
if ((ret = __bam_ditem(dbp, h, indx)) != 0)
return (ret);
break;
@@ -704,9 +791,9 @@ __bam_ritem(dbp, h, indx, data)
{
BKEYDATA *bk;
DBT orig, repl;
- db_indx_t lo, ln, min, off, prefix, suffix;
+ db_indx_t cnt, lo, ln, min, off, prefix, suffix;
int32_t nbytes;
- int cnt, ret;
+ int ret;
u_int8_t *p, *t;
/*
diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c
index a21a820..c39c9af 100644
--- a/db2/btree/bt_search.c
+++ b/db2/btree/bt_search.c
@@ -47,7 +47,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)bt_search.c 10.8 (Sleepycat) 10/25/97";
+static const char sccsid[] = "@(#)bt_search.c 10.9 (Sleepycat) 11/18/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -119,12 +119,20 @@ __bam_search(dbp, key, flags, stop, recnop, exactp)
return (ret);
}
- /* Decide if we need to save this page; if we do, write lock it. */
+ /*
+ * Decide if we need to save this page; if we do, write lock it.
+ * We deliberately don't lock-couple on this call. If the tree
+ * is tiny, i.e., one page, and two threads are busily updating
+ * the root page, we're almost guaranteed deadlocks galore, as
+ * each one gets a read lock and then blocks the other's attempt
+ * for a write lock.
+ */
if (!stack &&
((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
(LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
(void)memp_fput(dbp->mpf, h, 0);
- if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0)
+ (void)__BT_LPUT(dbp, lock);
+ if ((ret = __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
return (ret);
if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
(void)__BT_LPUT(dbp, lock);
diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c
index bc09131..219d486 100644
--- a/db2/btree/bt_split.c
+++ b/db2/btree/bt_split.c
@@ -44,7 +44,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)bt_split.c 10.17 (Sleepycat) 11/2/97";
+static const char sccsid[] = "@(#)bt_split.c 10.18 (Sleepycat) 11/23/97";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -396,14 +396,14 @@ __bam_broot(dbp, rootp, lp, rp)
* The btree comparison code guarantees that the left-most key on any
* level of the tree is never used, so it doesn't need to be filled in.
*/
+ memset(&bi, 0, sizeof(bi));
bi.len = 0;
B_TSET(bi.type, B_KEYDATA, 0);
bi.pgno = lp->pgno;
if (F_ISSET(dbp, DB_BT_RECNUM)) {
bi.nrecs = __bam_total(lp);
RE_NREC_SET(rootp, bi.nrecs);
- } else
- bi.nrecs = 0;
+ }
hdr.data = &bi;
hdr.size = SSZA(BINTERNAL, data);
if ((ret =
@@ -591,6 +591,7 @@ __bam_pinsert(dbp, parent, lchild, rchild)
return (DB_NEEDSPLIT);
/* Add a new record for the right page. */
+ memset(&bi, 0, sizeof(bi));
bi.len = child_bi->len;
B_TSET(bi.type, child_bi->type, 0);
bi.pgno = rchild->pgno;
@@ -640,6 +641,7 @@ noprefix: nksize = child_bk->len;
if (P_FREESPACE(ppage) < nbytes)
return (DB_NEEDSPLIT);
+ memset(&bi, 0, sizeof(bi));
bi.len = nksize;
B_TSET(bi.type, child_bk->type, 0);
bi.pgno = rchild->pgno;
@@ -661,6 +663,7 @@ noprefix: nksize = child_bk->len;
if (P_FREESPACE(ppage) < nbytes)
return (DB_NEEDSPLIT);
+ memset(&bi, 0, sizeof(bi));
bi.len = BOVERFLOW_SIZE;
B_TSET(bi.type, child_bk->type, 0);
bi.pgno = rchild->pgno;
diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c
index 45232bb..18b9b34 100644
--- a/db2/btree/btree_auto.c
+++ b/db2/btree/btree_auto.c
@@ -100,7 +100,6 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_pg_alloc_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_pg_alloc_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;
@@ -265,7 +264,6 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_pg_free_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_pg_free_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;
@@ -460,7 +458,6 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_split_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_split_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;
@@ -657,7 +654,6 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_rsplit_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_rsplit_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;
@@ -836,7 +832,6 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_adj_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_adj_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;
@@ -995,7 +990,6 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_cadjust_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_cadjust_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;
@@ -1145,7 +1139,6 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_cdel_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_cdel_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;
@@ -1329,7 +1322,6 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags,
* PUBLIC: int __bam_repl_print
* PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *));
*/
-
int
__bam_repl_print(notused1, dbtp, lsnp, notused3, notused4)
DB_LOG *notused1;