about summary refs log tree commit diff
path: root/node.go
diff options
context:
space:
mode:
Diffstat (limited to 'node.go')
-rw-r--r--  node.go  141
1 files changed, 123 insertions, 18 deletions
diff --git a/node.go b/node.go
index 49de144..fd737ea 100644
--- a/node.go
+++ b/node.go
@@ -12,12 +12,20 @@ type node struct {
isLeaf bool
unbalanced bool
key []byte
- depth int
pgid pgid
parent *node
+ children []*node
inodes inodes
}
+// root returns the top-level node this node is attached to.
+func (n *node) root() *node {
+ if n.parent == nil {
+ return n
+ }
+ return n.parent.root()
+}
+
// minKeys returns the minimum number of inodes this node should have.
func (n *node) minKeys() int {
if n.isLeaf {
@@ -185,12 +193,15 @@ func (n *node) write(p *page) {
}
}
-// split divides up the node into appropriately sized nodes.
+// split breaks up a node into smaller nodes, if appropriate.
+// This should only be called from the spill() function.
func (n *node) split(pageSize int) []*node {
+ var nodes = []*node{n}
+
// Ignore the split if the page doesn't have at least enough nodes for
// multiple pages or if the data can fit on a single page.
if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize {
- return []*node{n}
+ return nodes
}
// Set fill threshold to 50%.
@@ -198,28 +209,106 @@ func (n *node) split(pageSize int) []*node {
// Group into smaller pages and target a given fill size.
size := pageHeaderSize
- inodes := n.inodes
+ internalNodes := n.inodes
current := n
current.inodes = nil
- var nodes []*node
- for i, inode := range inodes {
+ // Loop over every inode and split once we reach our threshold.
+ for i, inode := range internalNodes {
elemSize := n.pageElementSize() + len(inode.key) + len(inode.value)
- if len(current.inodes) >= minKeysPerPage && i < len(inodes)-minKeysPerPage && size+elemSize > threshold {
- size = pageHeaderSize
+ // Split once we reach our threshold split size. However, this should
+ // only be done if we have enough keys for this node and we will have
+ // enough keys for the next node.
+ if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold {
+ // If there's no parent then we need to create one.
+ if n.parent == nil {
+ n.parent = &node{bucket: n.bucket, children: []*node{n}}
+ }
+
+ // Create a new node and add it to the parent.
+ current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+ n.parent.children = append(n.parent.children, current)
nodes = append(nodes, current)
- current = &node{bucket: n.bucket, isLeaf: n.isLeaf}
+
+ // Reset our running total back to zero (plus header size).
+ size = pageHeaderSize
+
+ // Update the statistics.
+ n.bucket.tx.stats.Split++
}
+ // Increase our running total of the size and append the inode.
size += elemSize
current.inodes = append(current.inodes, inode)
}
- nodes = append(nodes, current)
return nodes
}
+// spill writes the nodes to dirty pages and splits nodes as it goes.
+// Returns an error if dirty pages cannot be allocated.
+func (n *node) spill() error {
+ var tx = n.bucket.tx
+
+ // Spill child nodes first.
+ for _, child := range n.children {
+ if err := child.spill(); err != nil {
+ return err
+ }
+ }
+
+ // Add node's page to the freelist if it's not new.
+ if n.pgid > 0 {
+ tx.db.freelist.free(tx.id(), tx.page(n.pgid))
+ }
+
+ // Spill nodes by deepest first.
+ var nodes = n.split(tx.db.pageSize)
+ for _, node := range nodes {
+ // Allocate contiguous space for the node.
+ p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
+ if err != nil {
+ return err
+ }
+
+ // Write the node.
+ node.write(p)
+ node.pgid = p.id
+
+ // Insert into parent inodes.
+ if node.parent != nil {
+ var key = node.key
+ if key == nil {
+ key = node.inodes[0].key
+ }
+
+ node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
+ node.key = node.inodes[0].key
+ }
+
+ // Update the statistics.
+ tx.stats.Spill++
+ }
+
+ // This is a special case where we need to write the parent if it is new
+ // and caused by a split in the root.
+ var parent = n.parent
+ if parent != nil && parent.pgid == 0 {
+ // Allocate contiguous space for the node.
+ p, err := tx.allocate((parent.size() / tx.db.pageSize) + 1)
+ if err != nil {
+ return err
+ }
+
+ // Write the new root.
+ parent.write(p)
+ parent.pgid = p.id
+ }
+
+ return nil
+}
+
// rebalance attempts to combine the node with sibling nodes if the node fill
// size is below a threshold or if there are not enough keys.
func (n *node) rebalance() {
@@ -241,10 +330,11 @@ func (n *node) rebalance() {
if n.parent == nil {
// If root node is a branch and only has one node then collapse it.
if !n.isLeaf && len(n.inodes) == 1 {
- // Move child's children up.
+ // Move root's child up.
child := n.bucket.nodes[n.inodes[0].pgid]
n.isLeaf = child.isLeaf
n.inodes = child.inodes[:]
+ n.children = child.children
// Reparent all child nodes being moved.
for _, inode := range n.inodes {
@@ -278,7 +368,9 @@ func (n *node) rebalance() {
if useNextSibling {
// Reparent and move node.
if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok {
+ child.parent.removeChild(child)
child.parent = n
+ child.parent.children = append(child.parent.children, child)
}
n.inodes = append(n.inodes, target.inodes[0])
target.inodes = target.inodes[1:]
@@ -289,7 +381,9 @@ func (n *node) rebalance() {
} else {
// Reparent and move node.
if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok {
+ child.parent.removeChild(child)
child.parent = n
+ child.parent.children = append(child.parent.children, child)
}
n.inodes = append(n.inodes, inode{})
copy(n.inodes[1:], n.inodes)
@@ -309,26 +403,32 @@ func (n *node) rebalance() {
// Reparent all child nodes being moved.
for _, inode := range target.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
+ child.parent.removeChild(child)
child.parent = n
+ child.parent.children = append(child.parent.children, child)
}
}
// Copy over inodes from target and remove target.
n.inodes = append(n.inodes, target.inodes...)
n.parent.del(target.key)
+ n.parent.removeChild(target)
delete(n.bucket.nodes, target.pgid)
target.free()
} else {
// Reparent all child nodes being moved.
for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
+ child.parent.removeChild(child)
child.parent = target
+ child.parent.children = append(child.parent.children, child)
}
}
// Copy over inodes to target and remove node.
target.inodes = append(target.inodes, n.inodes...)
n.parent.del(n.key)
+ n.parent.removeChild(n)
n.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0)
delete(n.bucket.nodes, n.pgid)
n.free()
@@ -338,6 +438,17 @@ func (n *node) rebalance() {
n.parent.rebalance()
}
+// removeChild removes a node from the list of in-memory children.
+// This does not affect the inodes.
+func (n *node) removeChild(target *node) {
+ for i, child := range n.children {
+ if child == target {
+ n.children = append(n.children[:i], n.children[i+1:]...)
+ return
+ }
+ }
+}
+
// dereference causes the node to copy all its inode key/value references to heap memory.
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
func (n *node) dereference() {
@@ -362,16 +473,10 @@ func (n *node) dereference() {
func (n *node) free() {
if n.pgid != 0 {
n.bucket.tx.db.freelist.free(n.bucket.tx.id(), n.bucket.tx.page(n.pgid))
+ n.pgid = 0
}
}
-// nodesByDepth sorts a list of branches by deepest first.
-type nodesByDepth []*node
-
-func (s nodesByDepth) Len() int { return len(s) }
-func (s nodesByDepth) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
-func (s nodesByDepth) Less(i, j int) bool { return s[i].depth > s[j].depth }
-
// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.