/* * Copyright (c) Meta Platforms, Inc. and affiliates * * Authors: Mattias Nissler * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Nutanix nor the names of its contributors may be * used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include #include #include #include #include #include #include "btree.h" #include "common.h" /* * The allocation size for nodes. This parameter determines fan-out and thus * balances cost of scanning a node vs. allocation overhead tree height. */ #define BTREE_NODE_SIZE 4096 /* * Number of entries within a node. This calculates the number of entries based * on allocation size and node contents, filling the space not occupied by * fixed-size items with array entries. */ #define BTREE_NODE_NUM_ENTRIES \ ((BTREE_NODE_SIZE - (sizeof(size_t) + sizeof(struct btree_node *))) / \ (sizeof(uintptr_t) + sizeof(void *) + sizeof(struct btree_node *))) /* * Node alignment. Chosen such that there are enough unused low-order bits to * hold the node index in a cursor. */ #define BTREE_NODE_ALIGNMENT NEXT_POWER_OF_2(BTREE_NODE_NUM_ENTRIES) /* * Cursor mask, used for extracting the pointer and index fields from a cursor * value. */ #define BTREE_NODE_CURSOR_MASK (BTREE_NODE_ALIGNMENT - 1) /* * Minimum occupancy count of a node. This is used for making rebalancing * decisions. Note that the root node is an exception and may hold any number * of entries. */ #define BTREE_MIN_DEGREE (BTREE_NODE_NUM_ENTRIES / 2) /* Helpers for invoking memcpy/memmove on node entry arrays. */ #define btree_move_array_entries(array, from, to, count) \ memmove(&(array)[(to)], &(array)[(from)], (count) * sizeof((array)[0])) #define btree_copy_array_entries(from, to, count) \ memcpy((to), (from), (count) * sizeof((from)[0])) /* Represents a tree node. */ struct btree_node { /* Number of valid entries in the arrays below (children has count + 1). */ size_t count; /* * Numeric entry keys used as ordering criterion. These are stored directly * in the node rather than obtained from the corresponding value to make * sure tree traversal only needs to access node memory. */ uintptr_t keys[BTREE_NODE_NUM_ENTRIES]; /* * Tree data payload, entirely opaque to the code here. It is OK to store * NULL pointers, although that might make it harder for the caller to * distinguish present-but-NULL from absent entries. */ void *values[BTREE_NODE_NUM_ENTRIES]; /* * Child node pointers. These are conceptually on each side of a node's * entry, hence the number of children of a node is one more than its * entries. The subtree at the child left to an entry contains keys that * are less than or equal to the entry, the right subtree contains larger * or equal entries. */ struct btree_node *children[BTREE_NODE_NUM_ENTRIES + 1]; }; _Static_assert(sizeof(struct btree_node) <= BTREE_NODE_SIZE, "btree node size exceeds allocation size"); _Static_assert(BTREE_NODE_SIZE % BTREE_NODE_ALIGNMENT == 0, "allocation size must be a multiple of alignment"); /* Make a cursor from a node pointer and entry index. */ static inline btree_cursor_t btree_cursor(struct btree_node *node, size_t pos) { uintptr_t ptr = (uintptr_t)node; assert((ptr & BTREE_NODE_CURSOR_MASK) == 0); assert((pos & ~BTREE_NODE_CURSOR_MASK) == 0); return ptr | pos; } /* Extract the node pointer from a cursor value. */ static inline struct btree_node * btree_cursor_node(btree_iter_t *iter, int level) { return (struct btree_node *)(iter->cursors[level] & ~BTREE_NODE_CURSOR_MASK); } /* Extract the entry index from a cursor value. */ static inline size_t btree_cursor_pos(btree_iter_t *iter, int level) { return iter->cursors[level] & BTREE_NODE_CURSOR_MASK; } /* * Allocate a new node with the correct alignment so we have enough low-order * bits to use for index storage. */ static struct btree_node * node_alloc(void) { void *node = NULL; if (posix_memalign(&node, BTREE_NODE_ALIGNMENT, BTREE_NODE_SIZE) != 0) { return NULL; } memset(node, 0, sizeof(struct btree_node)); return node; } /* * Recursively free a node and all its descendants. */ static void node_destroy_recursive(struct btree_node *node, int height) { if (height == 0) { return; } for (size_t i = 0; i <= node->count; ++i) { node_destroy_recursive(node->children[i], height - 1); } free(node); } void btree_init(btree_t *tree) { assert(tree != NULL); tree->root = NULL; tree->size = 0; tree->height = 0; } void btree_destroy(btree_t *tree) { assert(tree != NULL); node_destroy_recursive(tree->root, tree->height); btree_init(tree); } size_t btree_size(btree_t *tree) { assert(tree != NULL); return tree->size; } void btree_iter_init(btree_t *tree, uintptr_t key, btree_iter_t *iter) { iter->tree = tree; /* * Build the cursor stack starting at the root, working towards the leaf * and filling in cursors for each level. Note that for empty trees, height * is 0 and thus the loop will be skipped entirely. */ struct btree_node *node = tree->root; for (int level = iter->tree->height - 1; level >= 0; --level) { /* Find the position within the current node. */ size_t pos; for (pos = 0; pos < node->count; pos++) { if (node->keys[pos] >= key) { break; } } assert(pos <= BTREE_NODE_NUM_ENTRIES); iter->cursors[level] = btree_cursor(node, pos); node = node->children[pos]; } } /* * Helper function to find the level of the element that is on the right side * of the iterator cut. This is usually the next element on the leaf level, but * in case we have reached the end of a node, we need to recursively check * parent nodes towards the tree root as indicated by the iterator's cursors. * Returns the level, or the tree height if we're at the right end of the tree * and no right side element exists. */ static inline int btree_iter_right_side_level(btree_iter_t *iter) { int level; for (level = 0; level < iter->tree->height; ++level) { struct btree_node *node = btree_cursor_node(iter, level); size_t pos = btree_cursor_pos(iter, level); if (pos < node->count) { break; } } return level; } void * btree_iter_get(btree_iter_t *iter, uintptr_t *key) { int level = btree_iter_right_side_level(iter); if (level >= iter->tree->height) { return NULL; } struct btree_node *node = btree_cursor_node(iter, level); size_t pos = btree_cursor_pos(iter, level); if (key != NULL) { *key = node->keys[pos]; } return node->values[pos]; } void * btree_iter_next(btree_iter_t *iter) { int level = btree_iter_right_side_level(iter); if (level >= iter->tree->height) { return NULL; } /* * Skip across the right element. We are sure it exists, otherwise we would * have bailed above. */ ++iter->cursors[level]; struct btree_node *node = btree_cursor_node(iter, level); size_t pos = btree_cursor_pos(iter, level); /* * Rebuild the cursor levels towards the leaf level: We're at the left edge * of the subtree between the entries in the level we advanced. */ for (node = node->children[pos]; --level >= 0; node = node->children[0]) { iter->cursors[level] = btree_cursor(node, 0); } return btree_iter_get(iter, NULL); } /* * Insert a new entry into the given node. The caller must make sure there is * capacity, and that the insertion maintains proper key order. */ static void btree_node_insert_entry(struct btree_node *node, size_t pos, uintptr_t key, void *value, struct btree_node *left_child, struct btree_node *right_child) { assert(pos <= node->count); /* There must be available space in the given node. */ assert(node->count < BTREE_NODE_NUM_ENTRIES); /* Callers must make sure to maintain key order. */ assert(pos == node->count || node->keys[pos] >= key); assert(pos == 0 || node->keys[pos - 1] <= key); /* Shift existing entries right to make room for the new entry. */ int count = node->count - pos; btree_move_array_entries(node->keys, pos, pos + 1, count); btree_move_array_entries(node->values, pos, pos + 1, count); btree_move_array_entries(node->children, pos + 1, pos + 2, count); /* Put the new entry in place. */ node->keys[pos] = key; node->values[pos] = value; node->children[pos] = left_child; node->children[pos + 1] = right_child; ++node->count; } int btree_iter_insert(btree_iter_t *iter, uintptr_t key, void *value) { /* Lazy initialization of empty tree. */ if (iter->tree->height == 0) { struct btree_node *root = node_alloc(); if (root == NULL) { errno = ENOMEM; return -1; } root->count = 1; root->keys[0] = key; root->values[0] = value; assert(iter->tree->root == NULL); iter->tree->root = root; iter->cursors[0] = btree_cursor(root, 0); iter->tree->size = 1; iter->tree->height = 1; return 0; } /* * We traverse the tree from the top along the path given by the iterator. * Along the way, we proactively split nodes that are at capacity. This * guarantees that we can insert another element if we need to push one up * from the next level we will visit. * * While proactive splitting might do more work than necessary, it * simplifies the implementation: Tree structure remains consistent at all * times, so we can just bail when hitting errors without having to repair * the tree in the error path. */ for (int level = iter->tree->height - 1; level >= 0; --level) { struct btree_node *node = btree_cursor_node(iter, level); size_t pos = btree_cursor_pos(iter, level); /* Reject insertion attempts that violate key order. */ if ((pos < node->count && key > node->keys[pos]) || (pos > 0 && key < node->keys[pos - 1])) { errno = EINVAL; return -1; } /* Proactively split the node if it is full. */ if (node->count == BTREE_NODE_NUM_ENTRIES) { /* Allocate a right sibling to insert */ struct btree_node *right = node_alloc(); if (right == NULL) { errno = ENOMEM; return -1; } /* If necessary, allocate a new root. */ if (level == iter->tree->height - 1) { if (iter->tree->height >= BTREE_MAX_HEIGHT) { free(right); errno = EOVERFLOW; return -1; } struct btree_node *root = node_alloc(); if (root == NULL) { free(right); errno = ENOMEM; return -1; } root->count = 0; /* insertion will bump it */ root->children[0] = node; iter->cursors[level + 1] = btree_cursor(root, 0); iter->tree->root = root; ++iter->tree->height; } size_t split = BTREE_NODE_NUM_ENTRIES / 2; size_t count = BTREE_NODE_NUM_ENTRIES - split; btree_copy_array_entries(&node->keys[split + 1], &right->keys[0], count - 1); btree_copy_array_entries(&node->values[split + 1], &right->values[0], count - 1); btree_copy_array_entries(&node->children[split + 1], &right->children[0], count); right->count = count - 1; uintptr_t median_key = node->keys[split]; void *median_value = node->values[split]; node->count = split; /* Push the median element to the parent level. */ btree_node_insert_entry(btree_cursor_node(iter, level + 1), btree_cursor_pos(iter, level + 1), median_key, median_value, node, right); /* Update the iterator cursor stack. */ iter->cursors[level] = pos <= split ? btree_cursor(node, pos) : btree_cursor(right, pos - split - 1); iter->cursors[level + 1] += pos <= split ? 0 : 1; } } /* Insert the new element. */ btree_node_insert_entry(btree_cursor_node(iter, 0), btree_cursor_pos(iter, 0), key, value, NULL, NULL); ++iter->tree->size; return 0; } /* * Remove the entry at the given position in a node. Entries to the right will * be shifted one position to the left to close the gap. */ static void btree_node_remove_entry(struct btree_node *node, size_t pos) { assert(pos < node->count); --node->count; size_t count = node->count - pos; btree_move_array_entries(node->keys, pos + 1, pos, count); btree_move_array_entries(node->values, pos + 1, pos, count); btree_move_array_entries(node->children, pos + 2, pos + 1, count); } void * btree_iter_remove(btree_iter_t *iter) { bool advance_iter = false; int level = btree_iter_right_side_level(iter); if (level >= iter->tree->height) { /* The iterator has reached the end, there is nothing to remove. */ return NULL; } struct btree_node *node = btree_cursor_node(iter, level); size_t pos = btree_cursor_pos(iter, level); void *value = node->values[pos]; if (level > 0) { /* * Inner node: Grab the left subtree's largest entry to use as a new * separator. */ struct btree_node *left_leaf_node = btree_cursor_node(iter, 0); assert(left_leaf_node->count > 0); --left_leaf_node->count; node->keys[pos] = left_leaf_node->keys[left_leaf_node->count]; node->values[pos] = left_leaf_node->values[left_leaf_node->count]; assert(btree_cursor_pos(iter, 0) > 0); --iter->cursors[0]; /* * We have moved the new separator element to the other side of the * iterator position, so the iterator must be advanced to maintain its * position. We can't do that here though because the cursor stack is * still needed for rebalancing, so we take a note to advance the * iterator later. */ advance_iter = true; } else { /* Leaf node: just remove the entry. */ btree_node_remove_entry(node, pos); } --iter->tree->size; /* Fix deficient nodes, working from the leaf level towards the root. */ for (level = 0; level < iter->tree->height - 1; ++level) { struct btree_node *parent = btree_cursor_node(iter, level + 1); pos = btree_cursor_pos(iter, level + 1); node = btree_cursor_node(iter, level); if (node->count >= BTREE_MIN_DEGREE) { /* * The node we're looking at has enough entries. This means we're * done, since nodes further towards the top haven't been changed. */ break; } /* Try rotating in an element from the left sibling. */ if (pos > 0) { struct btree_node *left_sibling = parent->children[pos - 1]; size_t left_count = left_sibling->count; if (left_count > BTREE_MIN_DEGREE) { btree_node_insert_entry( node, 0, parent->keys[pos - 1], parent->values[pos - 1], left_sibling->children[left_count], node->children[0]); parent->keys[pos - 1] = left_sibling->keys[left_count - 1]; parent->values[pos - 1] = left_sibling->values[left_count - 1]; --left_sibling->count; /* * We shifted the node's entries right by one, so adjust the * iterator's cursors for the current level. */ ++iter->cursors[level]; break; } } /* Try rotating in an element from the right sibling. */ if (pos < parent->count) { struct btree_node *right_sibling = parent->children[pos + 1]; if (right_sibling->count > BTREE_MIN_DEGREE) { node->keys[node->count] = parent->keys[pos]; node->values[node->count] = parent->values[pos]; node->children[node->count + 1] = right_sibling->children[0]; ++node->count; parent->keys[pos] = right_sibling->keys[0]; parent->values[pos] = right_sibling->values[0]; right_sibling->children[0] = right_sibling->children[1]; btree_node_remove_entry(right_sibling, 0); /* * Iterator state is good as is since we changed tree structure * only on the right of the iterator position. */ break; } } /* Rotation didn't work, merge nodes. */ size_t merge_pos = pos > 0 ? pos - 1 : pos; struct btree_node *left = parent->children[merge_pos]; struct btree_node *right = parent->children[merge_pos + 1]; left->keys[left->count] = parent->keys[merge_pos]; left->values[left->count] = parent->values[merge_pos]; ++left->count; size_t right_pos_shift = left->count; size_t right_count = right->count; btree_copy_array_entries(&right->keys[0], &left->keys[left->count], right_count); btree_copy_array_entries(&right->values[0], &left->values[left->count], right_count); btree_copy_array_entries(&right->children[0], &left->children[left->count], right_count + 1); left->count += right_count; assert(left->count <= BTREE_NODE_NUM_ENTRIES); btree_node_remove_entry(parent, merge_pos); if (node == right) { /* * Update iterator state if changes happened left to its position: * - The separator element between left and right in the parent * node has been removed: parent cursor position needs to be * decremented by one. * - The elements in the right node have been moved to the left * node: Update cursor with new node pointer and shifted index. */ --iter->cursors[level + 1]; iter->cursors[level] = btree_cursor( left, btree_cursor_pos(iter, level) + right_pos_shift); } free(right); } /* Remove the root node in case it has become empty. */ if (iter->tree->root->count == 0) { int height = --iter->tree->height; free(iter->tree->root); iter->tree->root = height > 0 ? btree_cursor_node(iter, height - 1) : NULL; } if (advance_iter) { btree_iter_next(iter); } return value; } /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */