aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Johnson <benbjohnson@yahoo.com>2014-05-05 10:27:02 -0600
committerBen Johnson <benbjohnson@yahoo.com>2014-05-05 16:39:55 -0600
commit55e71b090259eb775c1bb74a2c3ec23bdfba8db5 (patch)
treea955d1a2bf1d7174e5255a8d68a8e57a103a8819
parentMerge pull request #153 from benbjohnson/consolidate (diff)
downloaddedo-55e71b090259eb775c1bb74a2c3ec23bdfba8db5.tar.gz
dedo-55e71b090259eb775c1bb74a2c3ec23bdfba8db5.tar.xz
Add inline bucket support.
This commit adds support for writing small buckets directly inline to their value in their parent's leaf node. Previously, subbuckets would simply have a bucket header stored in their parent bucket which pointed to the root page. This required that every bucket use at least a single page. This has a high overhead for buckets with only one or two small items. The inline bucket check examines subbuckets to see if they only have a small amount of data (about 1kb) and no subbuckets. If these conditions are met, then the bucket's root node is written to a fake page which is simply a pointer to the end of the bucket's header. Fixes #124.
-rw-r--r--bucket.go161
-rw-r--r--bucket_test.go20
-rw-r--r--db.go6
-rw-r--r--db_test.go10
-rw-r--r--freelist.go5
-rw-r--r--node.go34
-rw-r--r--tx_test.go2
7 files changed, 189 insertions, 49 deletions
diff --git a/bucket.go b/bucket.go
index 5dd79dd..6672356 100644
--- a/bucket.go
+++ b/bucket.go
@@ -52,11 +52,14 @@ const (
minInt = -maxInt - 1
)
+const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
+
// Bucket represents a collection of key/value pairs inside the database.
type Bucket struct {
*bucket
tx *Tx
buckets map[string]*Bucket
+ page *page
rootNode *node
nodes map[pgid]*node
}
@@ -128,6 +131,11 @@ func (b *Bucket) Bucket(name []byte) *Bucket {
*child.bucket = *(*bucket)(unsafe.Pointer(&v[0]))
b.buckets[string(name)] = &child
+ // Save a reference to the inline page if the bucket is inline.
+ if child.root == 0 {
+ child.page = (*page)(unsafe.Pointer(&v[bucketHeaderSize]))
+ }
+
return &child
}
@@ -155,22 +163,19 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
}
}
- // Create a blank root leaf page.
- p, err := b.tx.allocate(1)
- if err != nil {
- return nil, err
- }
- p.flags = leafPageFlag
-
- // Insert key/value.
- value := make([]byte, unsafe.Sizeof(bucket{}))
- bucket := (*bucket)(unsafe.Pointer(&value[0]))
- bucket.root = p.id
+ // Create empty, inline bucket.
+ var bucket = Bucket{bucket: &bucket{}, rootNode: &node{isLeaf: true}}
+ var value = bucket.write()
// Insert into node.
key = cloneBytes(key)
c.node().put(key, key, value, 0, bucketLeafFlag)
+ // De-inline the parent bucket, if necessary.
+ if b.root == 0 {
+ b.page = nil
+ }
+
return b.Bucket(key), nil
}
@@ -224,9 +229,9 @@ func (b *Bucket) DeleteBucket(key []byte) error {
delete(b.buckets, string(key))
// Release all bucket pages to freelist.
- b.tx.forEachPage(child.root, 0, func(p *page, _ int) {
- b.tx.db.freelist.free(b.tx.id(), p)
- })
+ child.nodes = nil
+ child.rootNode = nil
+ child.free()
// Delete the node if we have a matching key.
c.node().del(key)
@@ -348,7 +353,7 @@ func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
func (b *Bucket) Stats() BucketStats {
var s BucketStats
pageSize := b.tx.db.pageSize
- b.tx.forEachPage(b.root, 0, func(p *page, depth int) {
+ b.forEachPage(func(p *page, depth int) {
if (p.flags & leafPageFlag) != 0 {
s.LeafPageN++
s.KeyN += int(p.count)
@@ -375,21 +380,47 @@ func (b *Bucket) Stats() BucketStats {
return s
}
+// forEachPage iterates over every page in a bucket, including inline pages.
+func (b *Bucket) forEachPage(fn func(*page, int)) {
+ // If we have an inline page then just use that.
+ if b.page != nil {
+ fn(b.page, 0)
+ return
+ }
+
+ // Otherwise traverse the page hierarchy.
+ b.tx.forEachPage(b.root, 0, fn)
+}
+
// spill writes all the nodes for this bucket to dirty pages.
func (b *Bucket) spill() error {
// Spill all child buckets first.
for name, child := range b.buckets {
- if err := child.spill(); err != nil {
- return err
+ // If the child bucket is small enough and it has no child buckets then
+ // write it inline into the parent bucket's page. Otherwise spill it
+ // like a normal bucket and make the parent value a pointer to the page.
+ var value []byte
+ if child.inlineable() {
+ child.free()
+ value = child.write()
+ } else {
+ if err := child.spill(); err != nil {
+ return err
+ }
+
+ // Update the child bucket header in this bucket.
+ value = make([]byte, unsafe.Sizeof(bucket{}))
+ bucket := (*bucket)(unsafe.Pointer(&value[0]))
+ *bucket = *child.bucket
}
- // Update the child bucket header in this bucket.
- value := make([]byte, unsafe.Sizeof(bucket{}))
- bucket := (*bucket)(unsafe.Pointer(&value[0]))
- *bucket = *child.bucket
+ // Skip writing the bucket if there are no materialized nodes.
+ if child.rootNode == nil {
+ continue
+ }
// Update parent node.
- c := b.Cursor()
+ var c = b.Cursor()
k, _, flags := c.seek([]byte(name))
_assert(bytes.Equal([]byte(name), k), "misplaced bucket header: %x -> %x", []byte(name), k)
_assert(flags&bucketLeafFlag != 0, "unexpected bucket header flag: %x", flags)
@@ -413,6 +444,54 @@ func (b *Bucket) spill() error {
return nil
}
+// inlineable returns true if a bucket is small enough to be written inline
+// and if it contains no subbuckets. Otherwise returns false.
+func (b *Bucket) inlineable() bool {
+ var n = b.rootNode
+
+ // Bucket must only contain a single leaf node.
+ if n == nil || !n.isLeaf {
+ return false
+ }
+
+ // Bucket is not inlineable if it contains subbuckets or if it goes beyond
+ // our threshold for inline bucket size.
+ var size = pageHeaderSize
+ for _, inode := range n.inodes {
+ size += leafPageElementSize + len(inode.key) + len(inode.value)
+
+ if inode.flags&bucketLeafFlag != 0 {
+ return false
+ } else if size > b.maxInlineBucketSize() {
+ return false
+ }
+ }
+
+ return true
+}
+
+// Returns the maximum total size of a bucket to make it a candidate for inlining.
+func (b *Bucket) maxInlineBucketSize() int {
+ return b.tx.db.pageSize / 4
+}
+
+// write allocates and writes a bucket to a byte slice.
+func (b *Bucket) write() []byte {
+ // Allocate the appropriate size.
+ var n = b.rootNode
+ var value = make([]byte, bucketHeaderSize+pageHeaderSize+n.size())
+
+ // Write a bucket header.
+ var bucket = (*bucket)(unsafe.Pointer(&value[0]))
+ *bucket = *b.bucket
+
+ // Convert byte slice to a fake page and write the root node.
+ var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
+ n.write(p)
+
+ return value
+}
+
// rebalance attempts to balance all nodes.
func (b *Bucket) rebalance() {
for _, n := range b.nodes {
@@ -431,14 +510,22 @@ func (b *Bucket) node(pgid pgid, parent *node) *node {
return n
}
- // Otherwise create a branch node and cache it.
+ // Otherwise create a node and cache it.
n := &node{bucket: b, parent: parent}
if parent == nil {
b.rootNode = n
} else {
parent.children = append(parent.children, n)
}
- n.read(b.tx.page(pgid))
+
+ // Use the inline page if this is an inline bucket.
+ var p = b.page
+ if p == nil {
+ p = b.tx.page(pgid)
+ }
+
+ // Read the page into the node and cache it.
+ n.read(p)
b.nodes[pgid] = n
// Update statistics.
@@ -447,6 +534,19 @@ func (b *Bucket) node(pgid pgid, parent *node) *node {
return n
}
+// free recursively frees all pages in the bucket.
+func (b *Bucket) free() {
+ if b.root == 0 {
+ return
+ }
+
+ var tx = b.tx
+ tx.forEachPage(b.root, 0, func(p *page, _ int) {
+ tx.db.freelist.free(tx.id(), p)
+ })
+ b.root = 0
+}
+
// dereference removes all references to the old mmap.
func (b *Bucket) dereference() {
for _, n := range b.nodes {
@@ -464,11 +564,24 @@ func (b *Bucket) dereference() {
// pageNode returns the in-memory node, if it exists.
// Otherwise returns the underlying page.
func (b *Bucket) pageNode(id pgid) (*page, *node) {
+ // Inline buckets have a fake page embedded in their value so treat them
+ // differently. We'll return the rootNode (if available) or the fake page.
+ if b.root == 0 {
+ _assert(id == 0, "inline bucket non-zero page access(2): %d != 0", id)
+ if b.rootNode != nil {
+ return nil, b.rootNode
+ }
+ return b.page, nil
+ }
+
+ // Check the node cache for non-inline buckets.
if b.nodes != nil {
if n := b.nodes[id]; n != nil {
return nil, n
}
}
+
+ // Finally lookup the page from the transaction if no node is materialized.
return b.tx.page(id), nil
}
diff --git a/bucket_test.go b/bucket_test.go
index dc89bcc..8fcf84c 100644
--- a/bucket_test.go
+++ b/bucket_test.go
@@ -583,18 +583,18 @@ func TestBucket_Stats_Small(t *testing.T) {
db.View(func(tx *Tx) error {
b := tx.Bucket([]byte("whozawhats"))
stats := b.Stats()
- assert.Equal(t, stats.BranchPageN, 0)
- assert.Equal(t, stats.BranchOverflowN, 0)
- assert.Equal(t, stats.LeafPageN, 1)
- assert.Equal(t, stats.LeafOverflowN, 0)
- assert.Equal(t, stats.KeyN, 1)
- assert.Equal(t, stats.Depth, 1)
+ assert.Equal(t, 0, stats.BranchPageN)
+ assert.Equal(t, 0, stats.BranchOverflowN)
+ assert.Equal(t, 1, stats.LeafPageN)
+ assert.Equal(t, 0, stats.LeafOverflowN)
+ assert.Equal(t, 1, stats.KeyN)
+ assert.Equal(t, 1, stats.Depth)
if os.Getpagesize() != 4096 {
// Incompatible page size
- assert.Equal(t, stats.BranchInuse, 0)
- assert.Equal(t, stats.BranchAlloc, 0)
- assert.Equal(t, stats.LeafInuse, 38)
- assert.Equal(t, stats.LeafAlloc, 4096)
+ assert.Equal(t, 0, stats.BranchInuse)
+ assert.Equal(t, 0, stats.BranchAlloc)
+ assert.Equal(t, 38, stats.LeafInuse)
+ assert.Equal(t, 4096, stats.LeafAlloc)
}
return nil
})
diff --git a/db.go b/db.go
index 9754f21..d96a161 100644
--- a/db.go
+++ b/db.go
@@ -563,6 +563,11 @@ func (db *DB) Check() error {
}
func (db *DB) checkBucket(b *Bucket, reachable map[pgid]*page, errors *ErrorList) {
+ // Ignore inline buckets.
+ if b.root == 0 {
+ return
+ }
+
// Check every page used by this bucket.
b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
// Ensure each page is only referenced once.
@@ -576,7 +581,6 @@ func (db *DB) checkBucket(b *Bucket, reachable map[pgid]*page, errors *ErrorList
// Retrieve page info.
info, err := b.tx.Page(int(p.id))
- // warnf("[page] %d + %d (%s)", p.id, p.overflow, info.Type)
if err != nil {
*errors = append(*errors, err)
} else if info == nil {
diff --git a/db_test.go b/db_test.go
index b7e2a2f..903f65e 100644
--- a/db_test.go
+++ b/db_test.go
@@ -275,7 +275,7 @@ func TestDB_Stats(t *testing.T) {
return err
})
stats := db.Stats()
- assert.Equal(t, 3, stats.TxStats.PageCount)
+ assert.Equal(t, 2, stats.TxStats.PageCount)
})
}
@@ -324,13 +324,7 @@ func TestDB_Consistency(t *testing.T) {
if p, _ := tx.Page(5); assert.NotNil(t, p) {
assert.Equal(t, "leaf", p.Type) // root leaf
}
- if p, _ := tx.Page(6); assert.NotNil(t, p) {
- assert.Equal(t, "leaf", p.Type)
- }
- if p, _ := tx.Page(7); assert.NotNil(t, p) {
- assert.Equal(t, "free", p.Type)
- }
- p, _ := tx.Page(8)
+ p, _ := tx.Page(6)
assert.Nil(t, p)
return nil
})
diff --git a/freelist.go b/freelist.go
index ebe2810..6c527fe 100644
--- a/freelist.go
+++ b/freelist.go
@@ -1,6 +1,7 @@
package bolt
import (
+ "fmt"
"sort"
"unsafe"
)
@@ -63,7 +64,7 @@ func (f *freelist) free(txid txid, p *page) {
}
f.pending[txid] = ids
- // DEBUG ONLY: f.check()
+ f.check() // DEBUG ONLY:
}
// release moves all page ids for a transaction id (or older) to the freelist.
@@ -114,7 +115,6 @@ func (f *freelist) write(p *page) {
// check verifies there are no double free pages.
// This is slow so it should only be used while debugging.
// If errors are found then a panic invoked.
-/*
func (f *freelist) check() {
var lookup = make(map[pgid]txid)
for _, id := range f.ids {
@@ -132,7 +132,6 @@ func (f *freelist) check() {
}
}
}
-*/
type reverseSortedPgids []pgid
diff --git a/node.go b/node.go
index fd737ea..72145c4 100644
--- a/node.go
+++ b/node.go
@@ -2,6 +2,8 @@ package bolt
import (
"bytes"
+ "fmt"
+ "io"
"sort"
"unsafe"
)
@@ -191,6 +193,8 @@ func (n *node) write(p *page) {
copy(b[0:], item.value)
b = b[len(item.value):]
}
+
+ // DEBUG ONLY: n.dump(os.Stderr)
}
// split breaks up a node into smaller nodes, if appropriate.
@@ -261,6 +265,7 @@ func (n *node) spill() error {
// Add node's page to the freelist if it's not new.
if n.pgid > 0 {
tx.db.freelist.free(tx.id(), tx.page(n.pgid))
+ n.pgid = 0
}
// Spill nodes by deepest first.
@@ -273,8 +278,8 @@ func (n *node) spill() error {
}
// Write the node.
- node.write(p)
node.pgid = p.id
+ node.write(p)
// Insert into parent inodes.
if node.parent != nil {
@@ -302,8 +307,8 @@ func (n *node) spill() error {
}
// Write the new root.
- parent.write(p)
parent.pgid = p.id
+ parent.write(p)
}
return nil
@@ -477,6 +482,31 @@ func (n *node) free() {
}
}
+// dump writes the contents of the node for debugging purposes.
+func (n *node) dump(w io.Writer) {
+ // Write node header.
+ var typ = "branch"
+ if n.isLeaf {
+ typ = "leaf"
+ }
+ fmt.Fprintf(w, "[NODE %d {type=%s count=%d}]\n", n.pgid, typ, len(n.inodes))
+
+ // Write out abbreviated version of each item.
+ for _, item := range n.inodes {
+ if n.isLeaf {
+ if item.flags&bucketLeafFlag != 0 {
+ bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
+ fmt.Fprintf(w, "[L \"%s\" -> (bucket root=%d)]\n", item.key, bucket.root)
+ } else {
+ fmt.Fprintf(w, "[L \"%s\" -> \"%s\"]\n", item.key, item.value)
+ }
+ } else {
+ fmt.Fprintf(w, "[B \"%s\" -> pgid=%d]\n", item.key, item.pgid)
+ }
+ }
+ fmt.Fprint(w, "\n")
+}
+
// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
diff --git a/tx_test.go b/tx_test.go
index 61fa5d9..e466765 100644
--- a/tx_test.go
+++ b/tx_test.go
@@ -219,7 +219,7 @@ func TestTx_DeleteBucket(t *testing.T) {
db.Update(func(tx *Tx) error {
// Verify that the bucket's page is free.
- assert.Equal(t, []pgid{7, 6, root, 2}, db.freelist.all())
+ assert.Equal(t, []pgid{5, 4}, db.freelist.all())
// Create the bucket again and make sure there's not a phantom value.
b, err := tx.CreateBucket([]byte("widgets"))