about summary refs log tree commit diff
path: root/node.go
diff options
context:
space:
mode:
Diffstat (limited to 'node.go')
-rw-r--r--  node.go  121
1 files changed, 75 insertions, 46 deletions
diff --git a/node.go b/node.go
index 1502be0..f0978ca 100644
--- a/node.go
+++ b/node.go
@@ -14,7 +14,7 @@ type node struct {
key []byte
pgid pgid
parent *node
- children []*node
+ children nodes
inodes inodes
}
@@ -205,15 +205,14 @@ func (n *node) write(p *page) {
// DEBUG ONLY: n.dump()
}
-// split breaks up a node into smaller nodes, if appropriate.
+// split breaks up a node into two smaller nodes, if appropriate.
// This should only be called from the spill() function.
func (n *node) split(pageSize int) []*node {
- var nodes = []*node{n}
-
// Ignore the split if the page doesn't have at least enough nodes for
- // multiple pages or if the data can fit on a single page.
- if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize {
- return nodes
+ // two pages or if the data can fit on a single page.
+ sz := n.size()
+ if len(n.inodes) <= (minKeysPerPage*2) || sz < pageSize {
+ return []*node{n}
}
// Determine the threshold before starting a new node.
@@ -225,43 +224,60 @@ func (n *node) split(pageSize int) []*node {
}
threshold := int(float64(pageSize) * fillPercent)
- // Group into smaller pages and target a given fill size.
- size := pageHeaderSize
- internalNodes := n.inodes
- current := n
- current.inodes = nil
-
- // Loop over every inode and split once we reach our threshold.
- for i, inode := range internalNodes {
- elemSize := n.pageElementSize() + len(inode.key) + len(inode.value)
-
- // Split once we reach our threshold split size. However, this should
- // only be done if we have enough keys for this node and we will have
- // enough keys for the next node.
- if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold {
- // If there's no parent then we need to create one.
- if n.parent == nil {
- n.parent = &node{bucket: n.bucket, children: []*node{n}}
- }
+ // Determine split position and sizes of the two pages.
+ splitIndex, sz0 := n.splitIndex(threshold)
+ sz1 := pageHeaderSize + (sz - sz0)
- // Create a new node and add it to the parent.
- current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
- n.parent.children = append(n.parent.children, current)
- nodes = append(nodes, current)
+ // If we can fit our extra keys on the next page then merge into it.
+ if next := n.nextSibling(); next != nil && next.size()+sz1 < threshold {
+ next.inodes = append(n.inodes[splitIndex:], next.inodes...)
+ n.inodes = n.inodes[:splitIndex]
+ return []*node{n}
+ }
- // Reset our running total back to zero (plus header size).
- size = pageHeaderSize
+ // Otherwise split node into two separate nodes. If there's no parent then
+ // we'll need to create one.
+ if n.parent == nil {
+ n.parent = &node{bucket: n.bucket, children: []*node{n}}
+ }
- // Update the statistics.
- n.bucket.tx.stats.Split++
+ // Create a new node and add it to the parent.
+ next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+ n.parent.children = append(n.parent.children, next)
+
+ // Split inodes across two nodes.
+ next.inodes = n.inodes[splitIndex:]
+ n.inodes = n.inodes[:splitIndex]
+
+ // Update the statistics.
+ n.bucket.tx.stats.Split++
+
+ return []*node{n, next}
+}
+
+// splitIndex finds the position where a page will fill a given threshold.
+// It returns the index as well as the size of the first page.
+// This should only be called from split().
+func (n *node) splitIndex(threshold int) (index, sz int) {
+ sz = pageHeaderSize
+
+ // Loop until we only have the minimum number of keys required for the second page.
+ for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
+ index = i
+ inode := n.inodes[i]
+ elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+
+ // If we have at least the minimum number of keys and adding another
+ // node would put us over the threshold then exit and return.
+ if i >= minKeysPerPage && sz+elsize > threshold {
+ break
}
- // Increase our running total of the size and append the inode.
- size += elemSize
- current.inodes = append(current.inodes, inode)
+ // Add the element size to the total size.
+ sz += elsize
}
- return nodes
+ return
}
// spill writes the nodes to dirty pages and splits nodes as it goes.
@@ -269,22 +285,29 @@ func (n *node) split(pageSize int) []*node {
func (n *node) spill() error {
var tx = n.bucket.tx
- // Spill child nodes first.
- for _, child := range n.children {
- if err := child.spill(); err != nil {
+ // Spill child nodes first. Child nodes can materialize sibling nodes in
+ // the case of split-merge so we cannot use a range loop. We have to check
+ // the children size on every loop iteration.
+ sort.Sort(n.children)
+ for i := 0; i < len(n.children); i++ {
+ if err := n.children[i].spill(); err != nil {
return err
}
}
- // Add node's page to the freelist if it's not new.
- if n.pgid > 0 {
- tx.db.freelist.free(tx.id(), tx.page(n.pgid))
- n.pgid = 0
- }
+ // We no longer need the child list because it's only used for spill tracking.
+ n.children = nil
- // Spill nodes by deepest first.
+ // Spill nodes by deepest first. The first node returned from split() will
+ // always be "n".
var nodes = n.split(tx.db.pageSize)
for _, node := range nodes {
+ // Add node's page to the freelist if it's not new.
+ if node.pgid > 0 {
+ tx.db.freelist.free(tx.id(), tx.page(node.pgid))
+ node.pgid = 0
+ }
+
// Allocate contiguous space for the node.
p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
if err != nil {
@@ -550,6 +573,12 @@ func (n *node) dump() {
}
*/
+type nodes []*node
+
+func (s nodes) Len() int { return len(s) }
+func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+
// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.