aboutsummaryrefslogtreecommitdiff
path: root/db2/common/db_region.c
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>1998-06-09 15:16:55 +0000
committerUlrich Drepper <drepper@redhat.com>1998-06-09 15:16:55 +0000
commitbf7997b65c7887d2acda95f5201d818a19d81711 (patch)
treeda3583de3a0b5892f90a4b1eb773a87b554ae37e /db2/common/db_region.c
parent7646e67e6cc4c738a7b402c60fed39d52db0433b (diff)
downloadglibc-bf7997b65c7887d2acda95f5201d818a19d81711.zip
glibc-bf7997b65c7887d2acda95f5201d818a19d81711.tar.gz
glibc-bf7997b65c7887d2acda95f5201d818a19d81711.tar.bz2
Update.
1998-06-09 Ulrich Drepper <drepper@cygnus.com> * sysdeps/unix/sysv/linux/netinet/ip.h (struct ip_options): Define __data member only for gcc. Reported by ak@muc.de. * misc/mntent.h: Undo last patch. * sysdeps/unix/sysv/linux/fstatvfs.c (fstatvfs): Undo last patch. * misc/tst/mntent.c: Adjust code for this change. * io/fts.c: Updated from a slightly more recent BSD version. * io/fts.h: Likewise. * libc.map: Add __libc_stack_end. * db2/Makefile (routines): Add lock_region. * db2/config.h: Update from db-2.4.14. * db2/db.h: Likewise. * db2/db_185.h: Likewise. * db2/db_int.h: Likewise. * db2/bt_close.c: Likewise. * db2/bt_compare.c: Likewise. * db2/bt_conv.c: Likewise. * db2/bt_cursor.c: Likewise. * db2/bt_delete.c: Likewise. * db2/bt_open.c: Likewise. * db2/bt_page.c: Likewise. * db2/bt_put.c: Likewise. * db2/bt_rec.c: Likewise. * db2/bt_recno.c: Likewise. * db2/bt_rsearch.c: Likewise. * db2/bt_search.c: Likewise. * db2/bt_split.c: Likewise. * db2/bt_stat.c: Likewise. * db2/btree.src: Likewise. * db2/btree_auto.c: Likewise. * db2/getlong.c: Likewise. * db2/db_appinit.c: Likewise. * db2/db_apprec.c: Likewise. * db2/db_byteorder.c: Likewise. * db2/db_err.c: Likewise. * db2/db_log2.c: Likewise. * db2/db_region.c: Likewise. * db2/db_salloc.c: Likewise. * db2/db_shash.c: Likewise. * db2/db.c: Likewise. * db2/db.src: Likewise. * db2/db_auto.c: Likewise. * db2/db_conv.c: Likewise. * db2/db_dispatch.c: Likewise. * db2/db_dup.c: Likewise. * db2/db_overflow.c: Likewise. * db2/db_pr.c: Likewise. * db2/db_rec.c: Likewise. * db2/db_ret.c: Likewise. * db2/db_thread.c: Likewise. * db2/db185.c: Likewise. * db2/db185_int.h: Likewise. * db2/dbm.c: Likewise. * db2/hash.c: Likewise. * db2/hash.src: Likewise. * db2/hash_auto.c: Likewise. * db2/hash_conv.c: Likewise. * db2/hash_debug.c: Likewise. * db2/hash_dup.c: Likewise. * db2/hash_func.c: Likewise. * db2/hash_page.c: Likewise. * db2/hash_rec.c: Likewise. * db2/hash_stat.c: Likewise. * db2/btree.h: Likewise. * db2/btree_ext.h: Likewise. 
* db2/clib_ext.h: Likewise. * db2/common_ext.h: Likewise. * db2/cxx_int.h: Likewise. * db2/db.h.src: Likewise. * db2/db_185.h.src: Likewise. * db2/db_am.h: Likewise. * db2/db_auto.h: Likewise. * db2/db_cxx.h: Likewise. * db2/db_dispatch.h: Likewise. * db2/db_ext.h: Likewise. * db2/db_int.h.src: Likewise. * db2/db_page.h: Likewise. * db2/db_shash.h: Likewise. * db2/db_swap.h: Likewise. * db2/hash.h: Likewise. * db2/hash_ext.h: Likewise. * db2/lock.h: Likewise. * db2/lock_ext.h: Likewise. * db2/log.h: Likewise. * db2/log_ext.h: Likewise. * db2/mp.h: Likewise. * db2/mp_ext.h: Likewise. * db2/mutex_ext.h: Likewise. * db2/os_ext.h: Likewise. * db2/os_func.h: Likewise. * db2/queue.h: Likewise. * db2/shqueue.h: Likewise. * db2/txn.h: Likewise. * db2/lock.c: Likewise. * db2/lock_conflict.c: Likewise. * db2/lock_deadlock.c: Likewise. * db2/lock_region.c: Likewise. * db2/lock_util.c: Likewise. * db2/log.c: Likewise. * db2/log.src: Likewise. * db2/log_archive.c: Likewise. * db2/log_auto.c: Likewise. * db2/log_compare.c: Likewise. * db2/log_findckp.c: Likewise. * db2/log_get.c: Likewise. * db2/log_put.c: Likewise. * db2/log_rec.c: Likewise. * db2/log_register.c: Likewise. * db2/mp_bh.c: Likewise. * db2/mp_fget.c: Likewise. * db2/mp_fopen.c: Likewise. * db2/mp_fput.c: Likewise. * db2/mp_fset.c: Likewise. * db2/mp_open.c: Likewise. * db2/mp_pr.c: Likewise. * db2/mp_region.c: Likewise. * db2/mp_sync.c: Likewise. * db2/68020.gcc: Likewise. * db2/mutex.c: Likewise. * db2/parisc.gcc: Likewise. * db2/parisc.hp: Likewise. * db2/sco.cc: Likewise. * db2/os_abs.c: Likewise. * db2/os_alloc.c: Likewise. * db2/os_config.c: Likewise. * db2/os_dir.c: Likewise. * db2/os_fid.c: Likewise. * db2/os_fsync.c: Likewise. * db2/os_map.c: Likewise. * db2/os_oflags.c: Likewise. * db2/os_open.c: Likewise. * db2/os_rpath.c: Likewise. * db2/os_rw.c: Likewise. * db2/os_seek.c: Likewise. * db2/os_sleep.c: Likewise. * db2/os_spin.c: Likewise. * db2/os_stat.c: Likewise. * db2/os_unlink.c: Likewise. 
* db2/db_archive.c: Likewise. * db2/db_checkpoint.c: Likewise. * db2/db_deadlock.c: Likewise. * db2/db_dump.c: Likewise. * db2/db_dump185.c: Likewise. * db2/db_load.c: Likewise. * db2/db_printlog.c: Likewise. * db2/db_recover.c: Likewise. * db2/db_stat.c: Likewise. * db2/txn.c: Likewise. * db2/txn.src: Likewise. * db2/txn_auto.c: Likewise. * db2/txn_rec.c: Likewise. * elf/rtld.c: Move definition of __libc_stack_end to ... * sysdeps/generic/dl-sysdep.h: ...here. * sysdeps/unix/sysv/linux/fstatvfs.c: Handle nodiratime option. * sysdeps/unix/sysv/linux/bits/statvfs.h: Define ST_NODIRATIME. * sysdeps/unix/sysv/linux/sys/mount.h: Define MS_NODIRATIME. 1998-06-08 21:44 Ulrich Drepper <drepper@cygnus.com> * sysdeps/unix/sysv/linux/fstatvfs.c: Handle constant option string from mntent correctly. 1998-06-06 Andreas Jaeger <aj@arthur.rhein-neckar.de> * sunrpc/Makefile (generated): Correct typo. 1998-06-04 Philip Blundell <philb@gnu.org> * elf/elf.h (EM_ARM, et al.): New definitions. * sysdeps/arm/dl-machine.h: Update for new draft ARM ELF ABI.
Diffstat (limited to 'db2/common/db_region.c')
-rw-r--r--db2/common/db_region.c1131
1 files changed, 692 insertions, 439 deletions
diff --git a/db2/common/db_region.c b/db2/common/db_region.c
index 02d939e..6d15f7f 100644
--- a/db2/common/db_region.c
+++ b/db2/common/db_region.c
@@ -1,59 +1,20 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997
+ * Copyright (c) 1996, 1997, 1998
* Sleepycat Software. All rights reserved.
*/
-/*
- * Copyright (c) 1995, 1996
- * The President and Fellows of Harvard University. All rights reserved.
- *
- * This code is derived from software contributed to Harvard by
- * Margo Seltzer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)db_region.c 10.21 (Sleepycat) 1/16/98";
+static const char sccsid[] = "@(#)db_region.c 10.46 (Sleepycat) 5/26/98";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
-#include <sys/stat.h>
#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#endif
@@ -61,548 +22,840 @@ static const char sccsid[] = "@(#)db_region.c 10.21 (Sleepycat) 1/16/98";
#include "db_int.h"
#include "common_ext.h"
-static int __db_rmap __P((DB_ENV *, int, size_t, void *));
+static int __db_growregion __P((REGINFO *, size_t));
/*
- * __db_rcreate --
- *
- * Common interface for creating a shared region. Handles synchronization
- * across multiple processes.
- *
- * The dbenv contains the environment for this process, including naming
- * information. The path argument represents the parameters passed to
- * the open routines and may be either a file or a directory. If it is
- * a directory, it must exist. If it is a file, then the file parameter
- * must be NULL, otherwise, file is the name to be created inside the
- * directory path.
- *
- * The function returns a pointer to the shared region that has been mapped
- * into memory, NULL on error.
+ * __db_rattach --
+ * Optionally create and attach to a shared memory region.
*
- * PUBLIC: int __db_rcreate __P((DB_ENV *, APPNAME,
- * PUBLIC: const char *, const char *, int, size_t, int, int *, void *));
+ * PUBLIC: int __db_rattach __P((REGINFO *));
*/
int
-__db_rcreate(dbenv, appname, path, file, mode, size, oflags, fdp, retp)
- DB_ENV *dbenv;
- APPNAME appname;
- const char *path, *file;
- int mode, oflags, *fdp;
- size_t size;
- void *retp;
+__db_rattach(infop)
+ REGINFO *infop;
{
- RLAYOUT *rp;
- int fd, ret;
- char *name;
+ RLAYOUT *rlp, rl;
+ size_t grow_region, size;
+ ssize_t nr, nw;
+ u_int32_t flags, mbytes, bytes;
+ u_int8_t *p;
+ int malloc_possible, ret, retry_cnt;
+
+ grow_region = 0;
+ malloc_possible = 1;
+ ret = retry_cnt = 0;
+
+ /* Round off the requested size to the next page boundary. */
+ DB_ROUNDOFF(infop->size);
+
+ /* Some architectures have hard limits on the maximum region size. */
+#ifdef DB_REGIONSIZE_MAX
+ if (infop->size > DB_REGIONSIZE_MAX) {
+ __db_err(infop->dbenv, "__db_rattach: cache size too large");
+ return (EINVAL);
+ }
+#endif
- fd = -1;
- rp = NULL;
+ /* Initialize the return information in the REGINFO structure. */
+loop: infop->addr = NULL;
+ infop->fd = -1;
+ infop->segid = INVALID_SEGID;
+ if (infop->name != NULL) {
+ FREES(infop->name);
+ infop->name = NULL;
+ }
+ F_CLR(infop, REGION_CANGROW | REGION_CREATED);
+#ifndef HAVE_SPINLOCKS
/*
- * Get the filename -- note, if it's a temporary file, it will
- * be created by the underlying temporary file creation code,
- * so we have to check the file descriptor to be sure it's an
- * error.
+ * XXX
+ * Lacking spinlocks, we must have a file descriptor for fcntl(2)
+ * locking, which implies using mmap(2) to map in a regular file.
+ * (Theoretically, we could probably get a file descriptor to lock
+ * other types of shared regions, but I don't see any reason to
+ * bother.)
*/
- if ((ret = __db_appname(dbenv, appname, path, file, &fd, &name)) != 0)
- return (ret);
+ malloc_possible = 0;
+#endif
+#ifdef __hppa
/*
- * Now open the file. We need to make sure that multiple processes
- * that attempt to create the region at the same time are properly
- * ordered, so we open it DB_EXCL and DB_CREATE so two simultaneous
- * attempts to create the region will return failure in one of the
- * attempts.
+ * XXX
+ * HP-UX won't permit mutexes to live in anything but shared memory.
+ * Instantiate a shared region file on that architecture, regardless.
*/
- oflags |= DB_CREATE | DB_EXCL;
- if (fd == -1 &&
- (ret = __db_open(name, oflags, oflags, mode, &fd)) != 0) {
- if (ret != EEXIST)
- __db_err(dbenv,
- "region create: %s: %s", name, strerror(ret));
- goto err;
+ malloc_possible = 0;
+#endif
+ /*
+ * If a region is truly private, malloc the memory. That's faster
+ * than either anonymous memory or a shared file.
+ */
+ if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
+ if ((infop->addr = __db_malloc(infop->size)) == NULL)
+ return (ENOMEM);
+
+ /*
+ * It's sometimes significantly faster to page-fault in all
+ * of the region's pages before we run the application, as
+ * we can see fairly nasty side-effects when we page-fault
+ * while holding various locks, i.e., the lock takes a long
+ * time, and other threads convoy behind the lock holder.
+ */
+ if (DB_GLOBAL(db_region_init))
+ for (p = infop->addr;
+ p < (u_int8_t *)infop->addr + infop->size;
+ p += DB_VMPAGESIZE)
+ p[0] = '\0';
+
+ F_SET(infop, REGION_CREATED | REGION_MALLOC);
+ goto region_init;
}
- *fdp = fd;
- /* Grow the region to the correct size. */
- if ((ret = __db_rgrow(dbenv, fd, size)) != 0)
- goto err;
+ /*
+ * Get the name of the region (creating the file if a temporary file
+ * is being used). The dbenv contains the current DB environment,
+ * including naming information. The path argument may be a file or
+ * a directory. If path is a directory, it must exist and file is the
+ * file name to be created inside the directory. If path is a file,
+ * then file must be NULL.
+ */
+ if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
+ infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
+ return (ret);
+ if (infop->fd != -1)
+ F_SET(infop, REGION_CREATED);
- /* Map the region in. */
- if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0)
- goto err;
+ /*
+ * Try to create the file, if we have authority. We have to make sure
+ * that multiple threads/processes attempting to simultaneously create
+ * the region are properly ordered, so we open it using DB_CREATE and
+ * DB_EXCL, so two attempts to create the region will return failure in
+ * one.
+ */
+ if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
+ flags = infop->dbflags;
+ LF_SET(DB_EXCL);
+ if ((ret = __db_open(infop->name,
+ flags, flags, infop->mode, &infop->fd)) == 0)
+ F_SET(infop, REGION_CREATED);
+ else
+ if (ret != EEXIST)
+ goto errmsg;
+ }
- /* Initialize the region. */
- if ((ret = __db_rinit(dbenv, rp, fd, size, 1)) != 0)
- goto err;
+ /* If we couldn't create the file, try and open it. */
+ if (infop->fd == -1) {
+ flags = infop->dbflags;
+ LF_CLR(DB_CREATE | DB_EXCL);
+ if ((ret = __db_open(infop->name,
+ flags, flags, infop->mode, &infop->fd)) != 0)
+ goto errmsg;
+ }
- if (name != NULL)
- FREES(name);
+ /*
+ * There are three cases we support:
+ * 1. Named anonymous memory (shmget(2)).
+ * 2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
+ * 3. Memory backed by a regular file (mmap(2)).
+ *
+ * We instantiate a backing file in all cases, which contains at least
+ * the RLAYOUT structure, and in case #3, contains the actual region.
+ * This is necessary for a couple of reasons:
+ *
+ * First, the mpool region uses temporary files to name regions, and
+ * since you may have multiple regions in the same directory, we need
+ * a filesystem name to ensure that they don't collide.
+ *
+ * Second, applications are allowed to forcibly remove regions, even
+ * if they don't know anything about them other than the name. If a
+ * region is backed by anonymous memory, there has to be some way for
+ * the application to find out that information, and, in some cases,
+ * determine ID information for the anonymous memory.
+ */
+ if (F_ISSET(infop, REGION_CREATED)) {
+ /*
+ * If we're using anonymous memory to back this region, set
+ * the flag.
+ */
+ if (DB_GLOBAL(db_region_anon))
+ F_SET(infop, REGION_ANONYMOUS);
- *(void **)retp = rp;
- return (0);
+ /*
+ * If we're using a regular file to back a region we created,
+ * grow it to the specified size.
+ */
+ if (!DB_GLOBAL(db_region_anon) &&
+ (ret = __db_growregion(infop, infop->size)) != 0)
+ goto err;
+ } else {
+ /*
+ * If we're joining a region, figure out what it looks like.
+ *
+ * XXX
+ * We have to figure out if the file is a regular file backing
+ * a region that we want to map into our address space, or a
+ * file with the information we need to find a shared anonymous
+ * region that we want to map into our address space.
+ *
+ * All this noise is because some systems don't have a coherent
+ * VM and buffer cache, and worse, if you mix operations on the
+ * VM and buffer cache, half the time you hang the system.
+ *
+ * There are two possibilities. If the file is the size of an
+ * RLAYOUT structure, then we know that the real region is in
+ * shared memory, because otherwise it would be bigger. (As
+ * the RLAYOUT structure size is smaller than a disk sector,
+ * the only way it can be this size is if deliberately written
+ * that way.) In which case, retrieve the information we need
+ * from the RLAYOUT structure and use it to acquire the shared
+ * memory.
+ *
+ * If the file is larger than an RLAYOUT structure, then
+ * the file is backing the shared memory region, and we use
+ * the current size of the file without reading any information
+ * from the file itself so that we don't confuse the VM.
+ *
+ * And yes, this makes me want to take somebody and kill them,
+ * but I can't think of any other solution.
+ */
+ if ((ret = __db_ioinfo(infop->name,
+ infop->fd, &mbytes, &bytes, NULL)) != 0)
+ goto errmsg;
+ size = mbytes * MEGABYTE + bytes;
+
+ if (size <= sizeof(RLAYOUT)) {
+ /*
+ * If the size is too small, the read fails or the
+ * valid flag is incorrect, assume it's because the
+ * RLAYOUT information hasn't been written out yet,
+ * and retry.
+ */
+ if (size < sizeof(RLAYOUT))
+ goto retry;
+ if ((ret =
+ __db_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
+ goto retry;
+ if (rl.valid != DB_REGIONMAGIC)
+ goto retry;
+
+ /* Copy the size, memory id and characteristics. */
+ size = rl.size;
+ infop->segid = rl.segid;
+ if (F_ISSET(&rl, REGION_ANONYMOUS))
+ F_SET(infop, REGION_ANONYMOUS);
+ }
-err: if (fd != -1) {
- if (rp != NULL)
- (void)__db_unmap(rp, rp->size);
- (void)__db_unlink(name);
- (void)__db_close(fd);
+ /*
+ * If the region is larger than we think, that's okay, use the
+ * current size. If it's smaller than we think, and we were
+ * just using the default size, that's okay, use the current
+ * size. If it's smaller than we think and we really care,
+ * save the size and we'll catch that further down -- we can't
+ * correct it here because we have to have a lock to grow the
+ * region.
+ */
+ if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
+ grow_region = infop->size;
+ infop->size = size;
}
- if (name != NULL)
- FREES(name);
- return (ret);
-}
-
-/*
- * __db_rinit --
- * Initialize the region.
- *
- * PUBLIC: int __db_rinit __P((DB_ENV *, RLAYOUT *, int, size_t, int));
- */
-int
-__db_rinit(dbenv, rp, fd, size, lock_region)
- DB_ENV *dbenv;
- RLAYOUT *rp;
- size_t size;
- int fd, lock_region;
-{
- int ret;
- COMPQUIET(dbenv, NULL);
+ /*
+ * Map the region into our address space. If we're creating it, the
+ * underlying routines will make it the right size.
+ *
+ * There are at least two cases where we can "reasonably" fail when
+ * we attempt to map in the region. On Windows/95, closing the last
+ * reference to a region causes it to be zeroed out. On UNIX, when
+ * using the shmget(2) interfaces, the region will no longer exist
+ * if the system was rebooted. In these cases, the underlying map call
+ * returns EAGAIN, and we *remove* our file and try again. There are
+ * obvious races in doing this, but it should eventually settle down
+ * to a winner and then things should proceed normally.
+ */
+ if ((ret = __db_mapregion(infop->name, infop)) != 0)
+ if (ret == EAGAIN) {
+ /*
+ * Pretend we created the region even if we didn't so
+ * that our error processing unlinks it.
+ */
+ F_SET(infop, REGION_CREATED);
+ ret = 0;
+ goto retry;
+ } else
+ goto err;
+region_init:
/*
- * Initialize the common information.
+ * Initialize the common region information.
*
* !!!
* We have to order the region creates so that two processes don't try
- * to simultaneously create the region and so that processes that are
- * joining the region never see inconsistent data. We'd like to play
- * file permissions games, but we can't because WNT filesystems won't
- * open a file mode 0.
- *
- * If the lock_region flag is set, the process creating the region
- * acquires the lock before the setting the version number. Any
- * process joining the region checks the version number before
- * attempting to acquire the lock. (The lock_region flag may not be
- * set -- the mpool code sometimes malloc's private regions but still
- * needs to initialize them, specifically, the mutex for threads.)
+ * to simultaneously create the region. This is handled by using the
+ * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
*
- * We have to check the version number first, because if the version
- * number has not been written, it's possible that the mutex has not
- * been initialized in which case an attempt to get it could lead to
- * random behavior. If the version number isn't there (the file size
- * is too small) or it's 0, we know that the region is being created.
- *
- * We also make sure to check the return of __db_mutex_lock() here,
- * even though we don't usually check elsewhere. This is the first
- * lock we attempt to acquire, and if it fails we have to know. (It
- * can fail -- SunOS, using fcntl(2) for locking, with an in-memory
- * filesystem specified as the database home.)
+ * We also have to order region joins so that processes joining regions
+ * never see inconsistent data. We'd like to play permissions games
+ * with the backing file, but we can't because WNT filesystems won't
+ * open a file mode 0.
*/
- __db_mutex_init(&rp->lock, MUTEX_LOCK_OFFSET(rp, &rp->lock));
- if (lock_region && (ret = __db_mutex_lock(&rp->lock, fd)) != 0)
- return (ret);
-
- rp->refcnt = 1;
- rp->size = size;
- rp->flags = 0;
- db_version(&rp->majver, &rp->minver, &rp->patch);
+ rlp = (RLAYOUT *)infop->addr;
+ if (F_ISSET(infop, REGION_CREATED)) {
+ /*
+ * The process creating the region acquires a lock before it
+ * sets the valid flag. Any processes joining the region will
+ * check the valid flag before acquiring the lock.
+ *
+ * Check the return of __db_mutex_init() and __db_mutex_lock(),
+ * even though we don't usually check elsewhere. This is the
+ * first lock we initialize and acquire, and we have to know if
+ * it fails. (It CAN fail, e.g., SunOS, when using fcntl(2)
+ * for locking, with an in-memory filesystem specified as the
+ * database home.)
+ */
+ if ((ret = __db_mutex_init(&rlp->lock,
+ MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
+ (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
+ goto err;
- return (0);
-}
+ /* Initialize the remaining region information. */
+ rlp->refcnt = 1;
+ rlp->size = infop->size;
+ db_version(&rlp->majver, &rlp->minver, &rlp->patch);
+ rlp->segid = infop->segid;
+ rlp->flags = 0;
+ if (F_ISSET(infop, REGION_ANONYMOUS))
+ F_SET(rlp, REGION_ANONYMOUS);
-/*
- * __db_ropen --
- * Construct the name of a file, open it and map it in.
- *
- * PUBLIC: int __db_ropen __P((DB_ENV *,
- * PUBLIC: APPNAME, const char *, const char *, int, int *, void *));
- */
-int
-__db_ropen(dbenv, appname, path, file, flags, fdp, retp)
- DB_ENV *dbenv;
- APPNAME appname;
- const char *path, *file;
- int flags, *fdp;
- void *retp;
-{
- RLAYOUT *rp;
- size_t size;
- u_int32_t mbytes, bytes;
- int fd, ret;
- char *name;
+ /*
+ * Fill in the valid field last -- use a magic number, memory
+ * may not be zero-filled, and we want to minimize the chance
+ * for collision.
+ */
+ rlp->valid = DB_REGIONMAGIC;
- fd = -1;
- rp = NULL;
+ /*
+ * If the region is anonymous, write the RLAYOUT information
+ * into the backing file so that future region join and unlink
+ * calls can find it.
+ *
+ * XXX
+ * We MUST do the seek before we do the write. On Win95, while
+ * closing the last reference to an anonymous shared region
+ * doesn't discard the region, it does zero it out. So, the
+ * REGION_CREATED may be set, but the file may have already
+ * been written and the file descriptor may be at the end of
+ * the file.
+ */
+ if (F_ISSET(infop, REGION_ANONYMOUS)) {
+ if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
+ goto err;
+ if ((ret =
+ __db_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
+ goto err;
+ }
+ } else {
+ /*
+ * Check the valid flag to ensure the region is initialized.
+ * If the valid flag has not been set, the mutex may not have
+ * been initialized, and an attempt to get it could lead to
+ * random behavior.
+ */
+ if (rlp->valid != DB_REGIONMAGIC)
+ goto retry;
- /* Get the filename. */
- if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0)
- return (ret);
+ /* Get the region lock. */
+ (void)__db_mutex_lock(&rlp->lock, infop->fd);
- /* Open the file. */
- if ((ret = __db_open(name, flags, DB_MUTEXDEBUG, 0, &fd)) != 0) {
- __db_err(dbenv, "region open: %s: %s", name, strerror(ret));
- goto err2;
- }
+ /*
+ * We now own the region. There are a couple of things that
+ * may have gone wrong, however.
+ *
+ * Problem #1: while we were waiting for the lock, the region
+ * was deleted. Detected by re-checking the valid flag, since
+ * it's cleared by the delete region routines.
+ */
+ if (rlp->valid != DB_REGIONMAGIC) {
+ (void)__db_mutex_unlock(&rlp->lock, infop->fd);
+ goto retry;
+ }
- *fdp = fd;
+ /*
+ * Problem #2: We want a bigger region than has previously been
+ * created. Detected by checking if the region is smaller than
+ * our caller requested. If it is, we grow the region, (which
+ * does the detach and re-attach for us).
+ */
+ if (grow_region != 0 &&
+ (ret = __db_rgrow(infop, grow_region)) != 0) {
+ (void)__db_mutex_unlock(&rlp->lock, infop->fd);
+ goto err;
+ }
- /*
- * Map the file in. We have to do things in a strange order so that
- * we don't get into a situation where the file was just created and
- * isn't yet initialized. See the comment in __db_rcreate() above.
- *
- * XXX
- * We'd like to test to see if the file is too big to mmap. Since we
- * don't know what size or type off_t's or size_t's are, or the largest
- * unsigned integral type is, or what random insanity the local C
- * compiler will perpetrate, doing the comparison in a portable way is
- * flatly impossible. Hope that mmap fails if the file is too large.
- *
- */
- if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) {
- __db_err(dbenv, "%s: %s", name, strerror(ret));
- goto err2;
- }
- size = mbytes * MEGABYTE + bytes;
+ /*
+ * Problem #3: when we checked the size of the file, it was
+ * still growing as part of creation. Detected by the fact
+ * that infop->size isn't the same size as the region.
+ */
+ if (infop->size != rlp->size) {
+ (void)__db_mutex_unlock(&rlp->lock, infop->fd);
+ goto retry;
+ }
- /* Check to make sure the first block has been written. */
- if (size < sizeof(RLAYOUT)) {
- ret = EAGAIN;
- goto err2;
+ /* Increment the reference count. */
+ ++rlp->refcnt;
}
- /* Map in whatever is there. */
- if ((ret = __db_rmap(dbenv, fd, size, &rp)) != 0)
- goto err2;
+ /* Return the region in a locked condition. */
- /*
- * Check to make sure the region has been initialized. We can't just
- * grab the lock because the lock may not have been initialized yet.
- */
- if (rp->majver == 0) {
- ret = EAGAIN;
- goto err2;
- }
-
- /* Get the region lock. */
- if (!LF_ISSET(DB_MUTEXDEBUG))
- (void)__db_mutex_lock(&rp->lock, fd);
+ if (0) {
+errmsg: __db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
- /*
- * The file may have been half-written if we were descheduled between
- * getting the size of the file and checking the major version. Check
- * to make sure we got the entire file.
- */
- if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) {
- __db_err(dbenv, "%s: %s", name, strerror(ret));
- goto err1;
- }
- if (size != mbytes * MEGABYTE + bytes) {
- ret = EAGAIN;
- goto err1;
- }
+err:
+retry: /* Discard the region. */
+ if (infop->addr != NULL) {
+ (void)__db_unmapregion(infop);
+ infop->addr = NULL;
+ }
- /* The file may have just been deleted. */
- if (F_ISSET(rp, DB_R_DELETED)) {
- ret = EAGAIN;
- goto err1;
- }
+ /* Discard the backing file. */
+ if (infop->fd != -1) {
+ (void)__db_close(infop->fd);
+ infop->fd = -1;
- /* Increment the reference count. */
- ++rp->refcnt;
+ if (F_ISSET(infop, REGION_CREATED))
+ (void)__db_unlink(infop->name);
+ }
- /* Release the lock. */
- if (!LF_ISSET(DB_MUTEXDEBUG))
- (void)__db_mutex_unlock(&rp->lock, fd);
+ /* Discard the name. */
+ if (infop->name != NULL) {
+ FREES(infop->name);
+ infop->name = NULL;
+ }
- FREES(name);
+ /*
+ * If we had a temporary error, wait a few seconds and
+ * try again.
+ */
+ if (ret == 0) {
+ if (++retry_cnt <= 3) {
+ __db_sleep(retry_cnt * 2, 0);
+ goto loop;
+ }
+ ret = EAGAIN;
+ }
+ }
- *(void **)retp = rp;
- return (0);
+ /*
+ * XXX
+ * HP-UX won't permit mutexes to live in anything but shared memory.
+ * Instantiate a shared region file on that architecture, regardless.
+ *
+ * XXX
+ * There's a problem in cleaning this up on application exit, or on
+ * application failure. If an application opens a database without
+ * an environment, we create a temporary backing mpool region for it.
+ * That region is marked REGION_PRIVATE, but as HP-UX won't permit
+ * mutexes to live in anything but shared memory, we instantiate a
+ * real file plus a memory region of some form. If the application
+ * crashes, the necessary information to delete the backing file and
+ * any system region (e.g., the shmget(2) segment ID) is no longer
+ * available. We can't completely fix the problem, but we try.
+ *
+ * The underlying UNIX __db_mapregion() code preferentially uses the
+ * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
+ * that are marked REGION_PRIVATE. This means that we normally aren't
+ * holding any system resources when we get here, in which case we can
+ * delete the backing file. This results in a short race, from the
+ * __db_open() call above to here.
+ *
+ * If, for some reason, we are holding system resources when we get
+ * here, we don't have any choice -- we can't delete the backing file
+ * because we may need it to detach from the resources. Set the
+ * REGION_LASTDETACH flag, so that we do all necessary cleanup when
+ * the application closes the region.
+ */
+ if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
+ if (F_ISSET(infop, REGION_HOLDINGSYS))
+ F_SET(infop, REGION_LASTDETACH);
+ else {
+ F_SET(infop, REGION_REMOVED);
+ F_CLR(infop, REGION_CANGROW);
+
+ (void)__db_close(infop->fd);
+ (void)__db_unlink(infop->name);
+ }
-err1: if (!LF_ISSET(DB_MUTEXDEBUG))
- (void)__db_mutex_unlock(&rp->lock, fd);
-err2: if (rp != NULL)
- (void)__db_unmap(rp, rp->size);
- if (fd != -1)
- (void)__db_close(fd);
- FREES(name);
return (ret);
}
/*
- * __db_rclose --
- * Close a shared memory region.
+ * __db_rdetach --
+ * Detach from a shared memory region, optionally removing it.
*
- * PUBLIC: int __db_rclose __P((DB_ENV *, int, void *));
+ * PUBLIC: int __db_rdetach __P((REGINFO *));
*/
int
-__db_rclose(dbenv, fd, ptr)
- DB_ENV *dbenv;
- int fd;
- void *ptr;
+__db_rdetach(infop)
+ REGINFO *infop;
{
- RLAYOUT *rp;
- int ret, t_ret;
- const char *fail;
+ RLAYOUT *rlp;
+ int detach, ret, t_ret;
- rp = ptr;
- fail = NULL;
+ ret = 0;
- /* Get the lock. */
- if ((ret = __db_mutex_lock(&rp->lock, fd)) != 0) {
- fail = "lock get";
- goto err;
+ /*
+ * If the region was removed when it was created, no further action
+ * is required.
+ */
+ if (F_ISSET(infop, REGION_REMOVED))
+ goto done;
+ /*
+ * If the region was created in memory returned by malloc, the only
+ * action required is freeing the memory.
+ */
+ if (F_ISSET(infop, REGION_MALLOC)) {
+ __db_free(infop->addr);
+ goto done;
}
+ /* Otherwise, detach from the region and optionally delete it. */
+ rlp = infop->addr;
+
+ /* Get the lock. */
+ (void)__db_mutex_lock(&rlp->lock, infop->fd);
+
/* Decrement the reference count. */
- --rp->refcnt;
+ if (rlp->refcnt == 0)
+ __db_err(infop->dbenv,
+ "region rdetach: reference count went to zero!");
+ else
+ --rlp->refcnt;
+
+ /*
+ * If we're going to remove the region, clear the valid flag so
+ * that any region join that's blocked waiting for us will know
+ * what happened.
+ */
+ detach = 0;
+ if (F_ISSET(infop, REGION_LASTDETACH))
+ if (rlp->refcnt == 0) {
+ detach = 1;
+ rlp->valid = 0;
+ } else
+ ret = EBUSY;
/* Release the lock. */
- if ((t_ret = __db_mutex_unlock(&rp->lock, fd)) != 0 && fail == NULL) {
- ret = t_ret;
- fail = "lock release";
- }
+ (void)__db_mutex_unlock(&rlp->lock, infop->fd);
- /* Discard the region. */
- if ((t_ret = __db_unmap(ptr, rp->size)) != 0 && fail == NULL) {
- ret = t_ret;
- fail = "munmap";
- }
+ /* Close the backing file descriptor. */
+ (void)__db_close(infop->fd);
+ infop->fd = -1;
- if ((t_ret = __db_close(fd)) != 0 && fail == NULL) {
+ /* Discard our mapping of the region. */
+ if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
ret = t_ret;
- fail = "close";
+
+ /* Discard the region itself, keeping the first real error code. */
+ if (detach) {
+ if ((t_ret =
+ __db_unlinkregion(infop->name, infop)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __db_unlink(infop->name)) != 0 && ret == 0)
+ ret = t_ret;
}
- if (fail == NULL)
- return (0);
+done: /* Discard the name. */
+ if (infop->name != NULL) {
+ FREES(infop->name);
+ infop->name = NULL;
+ }
-err: __db_err(dbenv, "region detach: %s: %s", fail, strerror(ret));
return (ret);
}
/*
* __db_runlink --
- * Remove a shared memory region.
+ * Remove a region.
+ *
+ * If force is zero, attach to the region, set REGION_LASTDETACH on
+ * infop and return whatever __db_rdetach returns. If force is
+ * non-zero, never attach: resolve the backing file name, work out the
+ * region's size and segment id from that file, then remove the region
+ * and unlink the file.
+ *
+ * Returns 0 on success (including when the backing file does not
+ * exist) and a non-zero error value otherwise. The first error seen
+ * while removing the region and its file is the one returned.
*
- * PUBLIC: int __db_runlink __P((DB_ENV *,
- * PUBLIC: APPNAME, const char *, const char *, int));
+ * PUBLIC: int __db_runlink __P((REGINFO *, int));
*/
int
-__db_runlink(dbenv, appname, path, file, force)
- DB_ENV *dbenv;
- APPNAME appname;
- const char *path, *file;
+__db_runlink(infop, force)
+ REGINFO *infop;
int force;
{
- RLAYOUT *rp;
- int cnt, fd, ret, t_ret;
+ RLAYOUT rl, *rlp;
+ size_t size;
+ ssize_t nr;
+ u_int32_t mbytes, bytes;
+ int fd, ret, t_ret;
char *name;
- rp = NULL;
+ /*
+ * XXX
+ * We assume that we've created a new REGINFO structure for this
+ * call, not used one that was already initialized. Regardless,
+ * if anyone is planning to use it after we're done, they're going
+ * to be sorely disappointed.
+ *
+ * If force isn't set, we attach to the region, set a flag to delete
+ * the region on last close, and let the region delete code do the
+ * work.
+ */
+ if (!force) {
+ if ((ret = __db_rattach(infop)) != 0)
+ return (ret);
- /* Get the filename. */
- if ((ret = __db_appname(dbenv, appname, path, file, NULL, &name)) != 0)
- return (ret);
+ /*
+ * NOTE(review): presumably __db_rattach returns with the region
+ * lock held; it is dropped here because __db_rdetach acquires
+ * the lock itself -- confirm against __db_rattach.
+ */
+ rlp = (RLAYOUT *)infop->addr;
+ (void)__db_mutex_unlock(&rlp->lock, infop->fd);
- /* If the file doesn't exist, we're done. */
- if (__db_exists(name, NULL))
- goto done;
+ F_SET(infop, REGION_LASTDETACH);
+
+ return (__db_rdetach(infop));
+ }
/*
- * If we're called with a force flag, try and unlink the file. This
- * may not succeed if the file is currently open, but there's nothing
- * we can do about that. There is a race condition between the check
- * for existence above and the actual unlink. If someone else snuck
- * in and removed it before we do the remove, then we might get an
- * ENOENT error. If we get the ENOENT, we treat it as success, just
- * as we do above.
+ * Otherwise, we don't want to attach to the region. We may have been
+ * called to clean up if a process died leaving a region locked and/or
+ * corrupted, which could cause the attach to hang.
*/
- if (force) {
- if ((ret = __db_unlink(name)) != 0 && ret != ENOENT)
- goto err1;
- goto done;
+ if ((ret = __db_appname(infop->dbenv, infop->appname,
+ infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
+ return (ret);
+
+ /*
+ * An underlying file is created for all regions other than private
+ * (REGION_PRIVATE) ones, regardless of whether or not it's used to
+ * back the region. If that file doesn't exist, we're done.
+ */
+ if (__db_exists(name, NULL) != 0) {
+ FREES(name);
+ return (0);
}
- /* Open and lock the region. */
- if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0)
- goto err1;
- (void)__db_mutex_lock(&rp->lock, fd);
+ /*
+ * See the comments in __db_rattach -- figure out if this is a regular
+ * file backing a region or if it's a regular file with information
+ * about a region.
+ */
+ if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0) {
+ /* Nothing was opened, so the error path must not close fd. */
+ fd = -1;
+ goto errmsg;
+ }
+ if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
+ goto errmsg;
+ /* Widen before multiplying so a file of 4GB or more doesn't wrap. */
+ size = (size_t)mbytes * MEGABYTE + bytes;
- /* If the region is currently being deleted, fail. */
- if (F_ISSET(rp, DB_R_DELETED)) {
- ret = ENOENT; /* XXX: ENOENT? */
- goto err2;
- }
+ if (size <= sizeof(RLAYOUT)) {
+ if ((ret = __db_read(fd, &rl, sizeof(rl), &nr)) != 0)
+ goto errmsg;
+ /*
+ * A short read leaves rl partly uninitialized, so it cannot
+ * be a valid region description; reject it the same way as a
+ * bad magic number.
+ */
+ if (nr != (ssize_t)sizeof(rl) || rl.valid != DB_REGIONMAGIC) {
+ __db_err(infop->dbenv,
+ "%s: illegal region magic number", name);
+ ret = EINVAL;
+ goto err;
+ }
- /* If the region is currently in use by someone else, fail. */
- if (rp->refcnt > 1) {
- ret = EBUSY;
- goto err2;
+ /* Set the size, memory id and characteristics. */
+ infop->size = rl.size;
+ infop->segid = rl.segid;
+ if (F_ISSET(&rl, REGION_ANONYMOUS))
+ F_SET(infop, REGION_ANONYMOUS);
+ } else {
+ infop->size = size;
+ infop->segid = INVALID_SEGID;
}
- /* Set the delete flag. */
- F_SET(rp, DB_R_DELETED);
-
- /* Release the lock and close the region. */
- (void)__db_mutex_unlock(&rp->lock, fd);
- if ((t_ret = __db_rclose(dbenv, fd, rp)) != 0 && ret == 0)
- goto err1;
+ /* Remove the underlying region. */
+ ret = __db_unlinkregion(name, infop);
/*
- * Unlink the region. There's a race here -- other threads or
- * processes might be opening the region while we're trying to
- * remove it. They'll fail, because we've set the DELETED flag,
- * but they could still stop us from succeeding in the unlink.
+ * Unlink the backing file. Close the open file descriptor first,
+ * because some architectures (e.g., Win32) won't unlink a file if
+ * open file descriptors remain.
*/
- for (cnt = 5; cnt > 0; --cnt) {
- if ((ret = __db_unlink(name)) == 0)
- break;
- (void)__db_sleep(0, 250000);
- }
- if (ret == 0) {
-done: FREES(name);
- return (0);
- }
-
- /* Not a clue. Try to clear the DB_R_DELETED flag. */
- if ((ret = __db_ropen(dbenv, appname, path, file, 0, &fd, &rp)) != 0)
- goto err1;
- (void)__db_mutex_lock(&rp->lock, fd);
- F_CLR(rp, DB_R_DELETED);
- /* FALLTHROUGH */
+ (void)__db_close(fd);
+ if ((t_ret = __db_unlink(name)) != 0 && ret == 0)
+ ret = t_ret;
-err2: (void)__db_mutex_unlock(&rp->lock, fd);
- (void)__db_rclose(dbenv, fd, rp);
-err1: __db_err(dbenv, "region unlink: %s: %s", name, strerror(ret));
+ /* Error-only code: reachable solely through the labels below. */
+ if (0) {
+errmsg: __db_err(infop->dbenv, "%s: %s", name, strerror(ret));
+err: if (fd != -1)
+ (void)__db_close(fd);
+ }
FREES(name);
return (ret);
}
/*
- * DB creates all regions on 4K boundaries so that we don't make the
- * underlying VM unhappy.
- */
-#define __DB_VMPAGESIZE (4 * 1024)
-
-/*
* __db_rgrow --
- * Extend a region by a specified amount.
+ * Extend a region.
+ *
+ * new_size is the requested total size of the region, not an
+ * increment: the amount to add is new_size, after DB_ROUNDOFF, less
+ * the size currently recorded in the region (rlp->size). On success
+ * the recorded size is set to the rounded new_size and the region is
+ * remapped through __db_rreattach.
+ *
+ * Returns EINVAL if REGION_CANGROW is not set on infop, otherwise the
+ * error from __db_growregion or the return value of __db_rreattach.
*
- * PUBLIC: int __db_rgrow __P((DB_ENV *, int, size_t));
+ * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
*/
int
-__db_rgrow(dbenv, fd, incr)
- DB_ENV *dbenv;
- int fd;
- size_t incr;
+__db_rgrow(infop, new_size)
+ REGINFO *infop;
+ size_t new_size;
+{
+ RLAYOUT *rlp;
+ size_t increment;
+ int ret;
+
+ /*
+ * !!!
+ * This routine MUST be called with the region already locked.
+ */
+
+ /* The underlying routines have flagged if this region can grow. */
+ if (!F_ISSET(infop, REGION_CANGROW))
+ return (EINVAL);
+
+ /*
+ * Round off the requested size to the next page boundary, and
+ * determine the additional space required.
+ *
+ * DB_ROUNDOFF is used as a statement, so it presumably updates
+ * new_size in place.
+ *
+ * NOTE(review): the difference is stored in a size_t, so a rounded
+ * new_size smaller than rlp->size would wrap to a very large
+ * increment -- confirm callers only ever request a larger size.
+ */
+ rlp = (RLAYOUT *)infop->addr;
+ DB_ROUNDOFF(new_size);
+ increment = new_size - rlp->size;
+
+ if ((ret = __db_growregion(infop, increment)) != 0)
+ return (ret);
+
+ /* Update the on-disk region size. */
+ rlp->size = new_size;
+
+ /* Detach from and reattach to the region. */
+ return (__db_rreattach(infop, new_size));
+}
+
+/*
+ * __db_growregion --
+ * Grow a shared memory region.
+ *
+ * Seeks to the end of the file open on infop->fd and extends it with
+ * zero-filled pages of DB_VMPAGESIZE bytes. If __db_mapinit() is
+ * true, every new page is written; otherwise only the last page of
+ * the increment is written and, if the db_region_init global is set,
+ * one byte is then read back from each new page.
+ *
+ * Returns 0 on success. On failure the error is reported through
+ * __db_err and returned; a short read or write is reported as EIO.
+ */
+static int
+__db_growregion(infop, increment)
+ REGINFO *infop;
+ size_t increment;
{
+ db_pgno_t pages;
size_t i;
- ssize_t nw;
- int mmap_init_needed, ret;
- char buf[__DB_VMPAGESIZE];
+ ssize_t nr, nw;
+ u_int32_t relative;
+ int ret;
+ char buf[DB_VMPAGESIZE];
/* Seek to the end of the region. */
- if ((ret = __db_seek(fd, 0, 0, 0, SEEK_END)) != 0)
+ if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
goto err;
/* Write nuls to the new bytes. */
memset(buf, 0, sizeof(buf));
/*
- * Historically, some systems required that all of the bytes of the
- * region be written before it could be mmapped and accessed randomly.
- *
- * Windows/95 doesn't have that problem, but it leaves file contents
- * uninitialized. Win/NT apparently initializes them.
+ * Some systems require that all of the bytes of the region be
+ * written before it can be mapped and accessed randomly, and
+ * other systems don't zero out the pages.
*/
-#ifdef MMAP_INIT_NEEDED
- mmap_init_needed = 1;
-#else
- mmap_init_needed = __os_oldwin();
-#endif
- if (mmap_init_needed)
+ if (__db_mapinit())
/* Extend the region by writing each new page. */
- for (i = 0; i < incr; i += __DB_VMPAGESIZE) {
- if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0)
+ for (i = 0; i < increment; i += DB_VMPAGESIZE) {
+ if ((ret =
+ __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
goto err;
if (nw != sizeof(buf))
goto eio;
}
else {
/*
- * Extend the region by writing the last page.
- *
- * Round off the increment to the next page boundary.
+ * Extend the region by writing the last page. If the region
+ * is >4Gb, increment may be larger than the maximum possible
+ * seek "relative" argument, as it's an unsigned 32-bit value.
+ * Break the offset into pages of 1MB each so that we don't
+ * overflow (2^20 * 2^32 is bigger than any memory I expect
+ * to see for a while).
+ *
+ * NOTE(review): increment - DB_VMPAGESIZE is computed in a
+ * size_t, so an increment smaller than DB_VMPAGESIZE would
+ * wrap -- confirm callers always pass at least one page.
*/
- incr += __DB_VMPAGESIZE - 1;
- incr -= incr % __DB_VMPAGESIZE;
-
- /* Write the last page, not the page after the last. */
- if ((ret =
- __db_seek(fd, 0, 0, incr - __DB_VMPAGESIZE, SEEK_CUR)) != 0)
+ pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
+ relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
+ if ((ret = __db_seek(infop->fd,
+ MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
goto err;
- if ((ret = __db_write(fd, buf, sizeof(buf), &nw)) != 0)
+ if ((ret = __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
goto err;
if (nw != sizeof(buf))
goto eio;
+
+ /*
+ * It's sometimes significantly faster to page-fault in all
+ * of the region's pages before we run the application, as
+ * we can see fairly nasty side-effects when we page-fault
+ * while holding various locks, i.e., the lock takes a long
+ * time, and other threads convoy behind the lock holder.
+ *
+ * NOTE(review): the fifth __db_seek argument is 1 only in the
+ * call below; presumably it requests a backward seek, placing
+ * the offset "increment" bytes before the end -- confirm
+ * against __db_seek.
+ */
+ if (DB_GLOBAL(db_region_init)) {
+ pages = increment / MEGABYTE;
+ relative = increment % MEGABYTE;
+ if ((ret = __db_seek(infop->fd,
+ MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
+ goto err;
+
+ /* Read a byte from each page. */
+ for (i = 0; i < increment; i += DB_VMPAGESIZE) {
+ if ((ret =
+ __db_read(infop->fd, buf, 1, &nr)) != 0)
+ goto err;
+ if (nr != 1)
+ goto eio;
+ if ((ret = __db_seek(infop->fd,
+ 0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
+ goto err;
+ }
+ }
}
return (0);
eio: ret = EIO;
-err: __db_err(dbenv, "region grow: %s", strerror(ret));
+err: __db_err(infop->dbenv, "region grow: %s", strerror(ret));
return (ret);
}
/*
- * __db_rremap --
- * Unmap the old region and map in a new region of a new size. If
- * either call fails, returns NULL, else returns the address of the
- * new region.
+ * __db_rreattach --
+ * Detach from and reattach to a region.
+ *
+ * Unmaps the region with __db_unmapregion, records new_size in
+ * infop->size, then maps it again with __db_mapregion using
+ * infop->name. Returns the unmap error if unmapping fails (in which
+ * case infop->size is left unchanged), otherwise the return value of
+ * __db_mapregion.
*
- * PUBLIC: int __db_rremap __P((DB_ENV *, void *, size_t, size_t, int, void *));
+ * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
*/
int
-__db_rremap(dbenv, ptr, oldsize, newsize, fd, retp)
- DB_ENV *dbenv;
- void *ptr, *retp;
- size_t oldsize, newsize;
- int fd;
+__db_rreattach(infop, new_size)
+ REGINFO *infop;
+ size_t new_size;
{
int ret;
- if ((ret = __db_unmap(ptr, oldsize)) != 0) {
- __db_err(dbenv, "region remap: munmap: %s", strerror(ret));
- return (ret);
+#ifdef DIAGNOSTIC
+ if (infop->name == NULL) {
+ __db_err(infop->dbenv, "__db_rreattach: name was NULL");
+ return (EINVAL);
}
+#endif
+ /*
+ * If we're growing an already mapped region, we have to unmap it
+ * and get it back. We have it locked, so nobody else can get in,
+ * which makes it fairly straightforward to do, as everybody else
+ * is going to block while we do the unmap/remap. NB: if we fail
+ * to get it back, the pooch is genuinely screwed, because we can
+ * never release the lock we're holding.
+ *
+ * Detach from the region. We have to do this first so architectures
+ * that don't permit a file to be mapped into different places in the
+ * address space simultaneously, e.g., HP's PA-RISC, will work.
+ */
+ if ((ret = __db_unmapregion(infop)) != 0)
+ return (ret);
- return (__db_rmap(dbenv, fd, newsize, retp));
-}
-
-/*
- * __db_rmap --
- * Attach to a shared memory region.
- */
-static int
-__db_rmap(dbenv, fd, size, retp)
- DB_ENV *dbenv;
- int fd;
- size_t size;
- void *retp;
-{
- RLAYOUT *rp;
- int ret;
+ /* Update the caller's REGINFO size to the new map size. */
+ infop->size = new_size;
- if ((ret = __db_map(fd, size, 0, 0, (void **)&rp)) != 0) {
- __db_err(dbenv, "region map: mmap %s", strerror(ret));
- return (ret);
- }
- if (rp->size < size)
- rp->size = size;
+ /* Attach to the region. */
+ ret = __db_mapregion(infop->name, infop);
- *(void **)retp = rp;
- return (0);
+ return (ret);
}