diff options
author | Ben Johnson <benbjohnson@yahoo.com> | 2014-06-03 16:44:58 -0600 |
---|---|---|
committer | Ben Johnson <benbjohnson@yahoo.com> | 2014-06-03 16:44:58 -0600 |
commit | 510143d852e394d2fb5fb773bab41f1d9e35d602 (patch) | |
tree | c4d06f342b5e53d5eea7a34d3d5138db74f26b15 /node.go | |
parent | Add ipxed to README. (diff) | |
parent | Fix merge-split spill issues. (diff) | |
download | dedo-510143d852e394d2fb5fb773bab41f1d9e35d602.tar.gz dedo-510143d852e394d2fb5fb773bab41f1d9e35d602.tar.xz |
Merge pull request #181 from benbjohnson/split-merge
Allow split nodes to be merged with the next node.
Diffstat (limited to 'node.go')
-rw-r--r-- | node.go | 121 |
1 file changed, 75 insertions, 46 deletions
@@ -14,7 +14,7 @@ type node struct { key []byte pgid pgid parent *node - children []*node + children nodes inodes inodes } @@ -205,15 +205,14 @@ func (n *node) write(p *page) { // DEBUG ONLY: n.dump() } -// split breaks up a node into smaller nodes, if appropriate. +// split breaks up a node into two smaller nodes, if appropriate. // This should only be called from the spill() function. func (n *node) split(pageSize int) []*node { - var nodes = []*node{n} - // Ignore the split if the page doesn't have at least enough nodes for - // multiple pages or if the data can fit on a single page. - if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize { - return nodes + // two pages or if the data can fit on a single page. + sz := n.size() + if len(n.inodes) <= (minKeysPerPage*2) || sz < pageSize { + return []*node{n} } // Determine the threshold before starting a new node. @@ -225,43 +224,60 @@ func (n *node) split(pageSize int) []*node { } threshold := int(float64(pageSize) * fillPercent) - // Group into smaller pages and target a given fill size. - size := pageHeaderSize - internalNodes := n.inodes - current := n - current.inodes = nil - - // Loop over every inode and split once we reach our threshold. - for i, inode := range internalNodes { - elemSize := n.pageElementSize() + len(inode.key) + len(inode.value) - - // Split once we reach our threshold split size. However, this should - // only be done if we have enough keys for this node and we will have - // enough keys for the next node. - if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold { - // If there's no parent then we need to create one. - if n.parent == nil { - n.parent = &node{bucket: n.bucket, children: []*node{n}} - } + // Determine split position and sizes of the two pages. + splitIndex, sz0 := n.splitIndex(threshold) + sz1 := pageHeaderSize + (sz - sz0) - // Create a new node and add it to the parent. 
- current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} - n.parent.children = append(n.parent.children, current) - nodes = append(nodes, current) + // If we can fit our extra keys on the next page then merge into it. + if next := n.nextSibling(); next != nil && next.size()+sz1 < threshold { + next.inodes = append(n.inodes[splitIndex:], next.inodes...) + n.inodes = n.inodes[:splitIndex] + return []*node{n} + } - // Reset our running total back to zero (plus header size). - size = pageHeaderSize + // Otherwise split node into two separate nodes. If there's no parent then + // we'll need to create one. + if n.parent == nil { + n.parent = &node{bucket: n.bucket, children: []*node{n}} + } - // Update the statistics. - n.bucket.tx.stats.Split++ + // Create a new node and add it to the parent. + next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} + n.parent.children = append(n.parent.children, next) + + // Split inodes across two nodes. + next.inodes = n.inodes[splitIndex:] + n.inodes = n.inodes[:splitIndex] + + // Update the statistics. + n.bucket.tx.stats.Split++ + + return []*node{n, next} +} + +// splitIndex finds the position where a page will fill a given threshold. +// It returns the index as well as the size of the first page. +// This is only be called from split(). +func (n *node) splitIndex(threshold int) (index, sz int) { + sz = pageHeaderSize + + // Loop until we only have the minimum number of keys required for the second page. + for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { + index = i + inode := n.inodes[i] + elsize := n.pageElementSize() + len(inode.key) + len(inode.value) + + // If we have at least the minimum number of keys and adding another + // node would put us over the threshold then exit and return. + if i >= minKeysPerPage && sz+elsize > threshold { + break } - // Increase our running total of the size and append the inode. 
- size += elemSize - current.inodes = append(current.inodes, inode) + // Add the element size to the total size. + sz += elsize } - return nodes + return } // spill writes the nodes to dirty pages and splits nodes as it goes. @@ -269,22 +285,29 @@ func (n *node) split(pageSize int) []*node { func (n *node) spill() error { var tx = n.bucket.tx - // Spill child nodes first. - for _, child := range n.children { - if err := child.spill(); err != nil { + // Spill child nodes first. Child nodes can materialize sibling nodes in + // the case of split-merge so we cannot use a range loop. We have to check + // the children size on every loop iteration. + sort.Sort(n.children) + for i := 0; i < len(n.children); i++ { + if err := n.children[i].spill(); err != nil { return err } } - // Add node's page to the freelist if it's not new. - if n.pgid > 0 { - tx.db.freelist.free(tx.id(), tx.page(n.pgid)) - n.pgid = 0 - } + // We no longer need the child list because it's only used for spill tracking. + n.children = nil - // Spill nodes by deepest first. + // Spill nodes by deepest first. The first node returned from split() will + // always be "n". var nodes = n.split(tx.db.pageSize) for _, node := range nodes { + // Add node's page to the freelist if it's not new. + if node.pgid > 0 { + tx.db.freelist.free(tx.id(), tx.page(node.pgid)) + node.pgid = 0 + } + // Allocate contiguous space for the node. p, err := tx.allocate((node.size() / tx.db.pageSize) + 1) if err != nil { @@ -550,6 +573,12 @@ func (n *node) dump() { } */ +type nodes []*node + +func (s nodes) Len() int { return len(s) } +func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 } + // inode represents an internal node inside of a node. // It can be used to point to elements in a page or point // to an element which hasn't been added to a page yet. |