diff options
Diffstat (limited to 'node.go')
-rw-r--r-- | node.go | 141 |
1 file changed, 123 insertions, 18 deletions
@@ -12,12 +12,20 @@ type node struct { isLeaf bool unbalanced bool key []byte - depth int pgid pgid parent *node + children []*node inodes inodes } +// root returns the top-level node this node is attached to. +func (n *node) root() *node { + if n.parent == nil { + return n + } + return n.parent.root() +} + // minKeys returns the minimum number of inodes this node should have. func (n *node) minKeys() int { if n.isLeaf { @@ -185,12 +193,15 @@ func (n *node) write(p *page) { } } -// split divides up the node into appropriately sized nodes. +// split breaks up a node into smaller nodes, if appropriate. +// This should only be called from the spill() function. func (n *node) split(pageSize int) []*node { + var nodes = []*node{n} + // Ignore the split if the page doesn't have at least enough nodes for // multiple pages or if the data can fit on a single page. if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize { - return []*node{n} + return nodes } // Set fill threshold to 50%. @@ -198,28 +209,106 @@ func (n *node) split(pageSize int) []*node { // Group into smaller pages and target a given fill size. size := pageHeaderSize - inodes := n.inodes + internalNodes := n.inodes current := n current.inodes = nil - var nodes []*node - for i, inode := range inodes { + // Loop over every inode and split once we reach our threshold. + for i, inode := range internalNodes { elemSize := n.pageElementSize() + len(inode.key) + len(inode.value) - if len(current.inodes) >= minKeysPerPage && i < len(inodes)-minKeysPerPage && size+elemSize > threshold { - size = pageHeaderSize + // Split once we reach our threshold split size. However, this should + // only be done if we have enough keys for this node and we will have + // enough keys for the next node. + if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold { + // If there's no parent then we need to create one. 
+ if n.parent == nil { + n.parent = &node{bucket: n.bucket, children: []*node{n}} + } + + // Create a new node and add it to the parent. + current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} + n.parent.children = append(n.parent.children, current) nodes = append(nodes, current) - current = &node{bucket: n.bucket, isLeaf: n.isLeaf} + + // Reset our running total back to zero (plus header size). + size = pageHeaderSize + + // Update the statistics. + n.bucket.tx.stats.Split++ } + // Increase our running total of the size and append the inode. size += elemSize current.inodes = append(current.inodes, inode) } - nodes = append(nodes, current) return nodes } +// spill writes the nodes to dirty pages and splits nodes as it goes. +// Returns an error if dirty pages cannot be allocated. +func (n *node) spill() error { + var tx = n.bucket.tx + + // Spill child nodes first. + for _, child := range n.children { + if err := child.spill(); err != nil { + return err + } + } + + // Add node's page to the freelist if it's not new. + if n.pgid > 0 { + tx.db.freelist.free(tx.id(), tx.page(n.pgid)) + } + + // Spill nodes by deepest first. + var nodes = n.split(tx.db.pageSize) + for _, node := range nodes { + // Allocate contiguous space for the node. + p, err := tx.allocate((node.size() / tx.db.pageSize) + 1) + if err != nil { + return err + } + + // Write the node. + node.write(p) + node.pgid = p.id + + // Insert into parent inodes. + if node.parent != nil { + var key = node.key + if key == nil { + key = node.inodes[0].key + } + + node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) + node.key = node.inodes[0].key + } + + // Update the statistics. + tx.stats.Spill++ + } + + // This is a special case where we need to write the parent if it is new + // and caused by a split in the root. + var parent = n.parent + if parent != nil && parent.pgid == 0 { + // Allocate contiguous space for the node. 
+ p, err := tx.allocate((parent.size() / tx.db.pageSize) + 1) + if err != nil { + return err + } + + // Write the new root. + parent.write(p) + parent.pgid = p.id + } + + return nil +} + // rebalance attempts to combine the node with sibling nodes if the node fill // size is below a threshold or if there are not enough keys. func (n *node) rebalance() { @@ -241,10 +330,11 @@ func (n *node) rebalance() { if n.parent == nil { // If root node is a branch and only has one node then collapse it. if !n.isLeaf && len(n.inodes) == 1 { - // Move child's children up. + // Move root's child up. child := n.bucket.nodes[n.inodes[0].pgid] n.isLeaf = child.isLeaf n.inodes = child.inodes[:] + n.children = child.children // Reparent all child nodes being moved. for _, inode := range n.inodes { @@ -278,7 +368,9 @@ func (n *node) rebalance() { if useNextSibling { // Reparent and move node. if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok { + child.parent.removeChild(child) child.parent = n + child.parent.children = append(child.parent.children, child) } n.inodes = append(n.inodes, target.inodes[0]) target.inodes = target.inodes[1:] @@ -289,7 +381,9 @@ func (n *node) rebalance() { } else { // Reparent and move node. if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok { + child.parent.removeChild(child) child.parent = n + child.parent.children = append(child.parent.children, child) } n.inodes = append(n.inodes, inode{}) copy(n.inodes[1:], n.inodes) @@ -309,26 +403,32 @@ func (n *node) rebalance() { // Reparent all child nodes being moved. for _, inode := range target.inodes { if child, ok := n.bucket.nodes[inode.pgid]; ok { + child.parent.removeChild(child) child.parent = n + child.parent.children = append(child.parent.children, child) } } // Copy over inodes from target and remove target. n.inodes = append(n.inodes, target.inodes...) 
n.parent.del(target.key) + n.parent.removeChild(target) delete(n.bucket.nodes, target.pgid) target.free() } else { // Reparent all child nodes being moved. for _, inode := range n.inodes { if child, ok := n.bucket.nodes[inode.pgid]; ok { + child.parent.removeChild(child) child.parent = target + child.parent.children = append(child.parent.children, child) } } // Copy over inodes to target and remove node. target.inodes = append(target.inodes, n.inodes...) n.parent.del(n.key) + n.parent.removeChild(n) n.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0) delete(n.bucket.nodes, n.pgid) n.free() @@ -338,6 +438,17 @@ func (n *node) rebalance() { n.parent.rebalance() } +// removes a node from the list of in-memory children. +// This does not affect the inodes. +func (n *node) removeChild(target *node) { + for i, child := range n.children { + if child == target { + n.children = append(n.children[:i], n.children[i+1:]...) + return + } + } +} + // dereference causes the node to copy all its inode key/value references to heap memory. // This is required when the mmap is reallocated so inodes are not pointing to stale data. func (n *node) dereference() { @@ -362,16 +473,10 @@ func (n *node) dereference() { func (n *node) free() { if n.pgid != 0 { n.bucket.tx.db.freelist.free(n.bucket.tx.id(), n.bucket.tx.page(n.pgid)) + n.pgid = 0 } } -// nodesByDepth sorts a list of branches by deepest first. -type nodesByDepth []*node - -func (s nodesByDepth) Len() int { return len(s) } -func (s nodesByDepth) Swap(i, j int) { s[i], s[j] = s[j], s[i] } -func (s nodesByDepth) Less(i, j int) bool { return s[i].depth > s[j].depth } - // inode represents an internal node inside of a node. // It can be used to point to elements in a page or point // to an element which hasn't been added to a page yet. |