diff options
Diffstat (limited to 'transaction.go')
-rw-r--r-- | transaction.go | 2424 |
1 files changed, 1214 insertions, 1210 deletions
diff --git a/transaction.go b/transaction.go index 1064e8e..de3bd0d 100644 --- a/transaction.go +++ b/transaction.go @@ -16,7 +16,7 @@ type Transaction interface { type transaction struct { id int flags int - db *db + db *DB parent *transaction child *transaction nextPageNumber int @@ -25,7 +25,7 @@ type transaction struct { dirtyList []int reader *reader // TODO: bucketxs []*bucketx - buckets []*bucket + buckets []*Bucket bucketFlags []int cursors []*cursor // Implicit from slices? TODO: MDB_dbi mt_numdbs; @@ -35,41 +35,40 @@ type transaction struct { // ntxn represents a nested transaction. type ntxn struct { transaction *transaction /**< the transaction */ - pageState pageState /**< parent transaction's saved freestate */ + pageState pageState /**< parent transaction's saved freestate */ } - func (t *transaction) allocPage(num int) *page { /* - MDB_env *env = txn->mt_env; - MDB_page *ret = env->me_dpages; - size_t psize = env->me_psize, sz = psize, off; - // For ! #MDB_NOMEMINIT, psize counts how much to init. - // For a single page alloc, we init everything after the page header. - // For multi-page, we init the final page; if the caller needed that - // many pages they will be filling in at least up to the last page. - if (num == 1) { - if (ret) { - VGMEMP_ALLOC(env, ret, sz); - VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); - env->me_dpages = ret->mp_next; - return ret; + MDB_env *env = txn->mt_env; + MDB_page *ret = env->me_dpages; + size_t psize = env->me_psize, sz = psize, off; + // For ! #MDB_NOMEMINIT, psize counts how much to init. + // For a single page alloc, we init everything after the page header. + // For multi-page, we init the final page; if the caller needed that + // many pages they will be filling in at least up to the last page. + if (num == 1) { + if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + } + psize -= off = PAGEHDRSZ; + } else { + sz *= num; + off = sz - psize; } - psize -= off = PAGEHDRSZ; - } else { - sz *= num; - off = sz - psize; - } - if ((ret = malloc(sz)) != NULL) { - VGMEMP_ALLOC(env, ret, sz); - if (!(env->me_flags & MDB_NOMEMINIT)) { - memset((char *)ret + off, 0, psize); - ret->mp_pad = 0; + if ((ret = malloc(sz)) != NULL) { + VGMEMP_ALLOC(env, ret, sz); + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; } - } else { - txn->mt_flags |= MDB_TXN_ERROR; - } - return ret; + return ret; */ return nil } @@ -77,19 +76,19 @@ func (t *transaction) allocPage(num int) *page { // Find oldest txnid still referenced. Expects txn->mt_txnid > 0. func (t *transaction) oldest() int { /* - int i; - txnid_t mr, oldest = txn->mt_txnid - 1; - if (txn->mt_env->me_txns) { - MDB_reader *r = txn->mt_env->me_txns->mti_readers; - for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - mr = r[i].mr_txnid; - if (oldest > mr) - oldest = mr; + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } } } - } - return oldest; + return oldest; */ return 0 } @@ -97,19 +96,19 @@ func (t *transaction) oldest() int { // Add a page to the txn's dirty list func (t *transaction) dirty(p *page) { /* - MDB_ID2 mid; - int rc, (*insert)(MDB_ID2L, MDB_ID2 *); - - if (txn->mt_env->me_flags & MDB_WRITEMAP) { - insert = mdb_mid2l_append; - } else { - insert = mdb_mid2l_insert; - } - mid.mid = mp->mp_pgno; - mid.mptr = mp; - rc = insert(txn->mt_u.dirty_list, &mid); - mdb_tassert(txn, rc == 0); - txn->mt_dirty_room--; + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_env->me_flags & MDB_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); + txn->mt_dirty_room--; */ } @@ -122,53 +121,53 @@ func (t *transaction) dirty(p *page) { // mp wasn't spilled. func (t *transaction) unspill(p *page) *page { /* - MDB_env *env = txn->mt_env; - const MDB_txn *tx2; - unsigned x; - pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - - for (tx2 = txn; tx2; tx2=tx2->mt_parent) { - if (!tx2->mt_spill_pgs) - continue; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - MDB_page *np; - int num; - if (txn->mt_dirty_room == 0) - return MDB_TXN_FULL; - if (IS_OVERFLOW(mp)) - num = mp->mp_pages; - else - num = 1; - if (env->me_flags & MDB_WRITEMAP) { - np = mp; - } else { - np = mdb_page_malloc(txn, num); - if (!np) - return ENOMEM; - if (num > 1) - memcpy(np, mp, num * env->me_psize); + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; else - mdb_page_copy(np, mp, env->me_psize); + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + // If in current txn, this page is no longer spilled. + // If it happens to be the last page, truncate the spill list. + // Otherwise mark it as deleted by setting the LSB. + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } // otherwise, if belonging to a parent txn, the + // page remains spilled until child commits + + mdb_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; } - if (tx2 == txn) { - // If in current txn, this page is no longer spilled. - // If it happens to be the last page, truncate the spill list. - // Otherwise mark it as deleted by setting the LSB. - if (x == txn->mt_spill_pgs[0]) - txn->mt_spill_pgs[0]--; - else - txn->mt_spill_pgs[x] |= 1; - } // otherwise, if belonging to a parent txn, the - // page remains spilled until child commits - - mdb_page_dirty(txn, np); - np->mp_flags |= P_DIRTY; - *ret = np; - break; } - } - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -176,37 +175,37 @@ func (t *transaction) unspill(p *page) *page { // Back up parent txn's cursors, then grab the originals for tracking func (t *transaction) shadow(dst *transaction) error { /* - MDB_cursor *mc, *bk; - MDB_xcursor *mx; - size_t size; - int i; - - for (i = src->mt_numdbs; --i >= 0; ) { - if ((mc = src->mt_cursors[i]) != NULL) { - size = sizeof(MDB_cursor); - if (mc->mc_xcursor) - size += sizeof(MDB_xcursor); - for (; mc; mc = bk->mc_next) { - bk = malloc(size); - if (!bk) - return ENOMEM; - *bk = *mc; - mc->mc_backup = bk; - mc->mc_db = &dst->mt_dbs[i]; - // Kill pointers into src - and dst to reduce abuse: The - // user may not use mc until dst ends. Otherwise we'd... - mc->mc_txn = NULL; // ...set this to dst - mc->mc_dbflag = NULL; // ...and &dst->mt_dbflags[i] - if ((mx = mc->mc_xcursor) != NULL) { - *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = NULL; // ...and dst. + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; + + for (i = src->mt_numdbs; --i >= 0; ) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (!bk) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + // Kill pointers into src - and dst to reduce abuse: The + // user may not use mc until dst ends. Otherwise we'd... + mc->mc_txn = NULL; // ...set this to dst + mc->mc_dbflag = NULL; // ...and &dst->mt_dbflags[i] + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk+1) = *mx; + mx->mx_cursor.mc_txn = NULL; // ...and dst. + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; } - mc->mc_next = dst->mt_cursors[i]; - dst->mt_cursors[i] = mc; } } - } - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -217,36 +216,36 @@ func (t *transaction) shadow(dst *transaction) error { // @return 0 on success, non-zero on failure. func (t *transaction) closeCursors(merge bool) { /* - MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDB_xcursor *mx; - int i; - - for (i = txn->mt_numdbs; --i >= 0; ) { - for (mc = cursors[i]; mc; mc = next) { - next = mc->mc_next; - if ((bk = mc->mc_backup) != NULL) { - if (merge) { - // Commit changes to parent txn - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbflag = bk->mc_dbflag; - if ((mx = mc->mc_xcursor) != NULL) - mx->mx_cursor.mc_txn = bk->mc_txn; - } else { - // Abort nested txn - *mc = *bk; - if ((mx = mc->mc_xcursor) != NULL) - *mx = *(MDB_xcursor *)(bk+1); + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; + + for (i = txn->mt_numdbs; --i >= 0; ) { + for (mc = cursors[i]; mc; mc = next) { + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + // Commit changes to parent txn + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + // Abort nested txn + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk+1); + } + mc = bk; } - mc = bk; + // Only malloced cursors are permanently tracked. + free(mc); } - // Only malloced cursors are permanently tracked. - free(mc); + cursors[i] = NULL; } - cursors[i] = NULL; - } */ } @@ -255,168 +254,168 @@ func (t *transaction) closeCursors(merge bool) { // @return 0 on success, non-zero on failure. func (t *transaction) renew() error { /* - MDB_env *env = txn->mt_env; - MDB_txninfo *ti = env->me_txns; - MDB_meta *meta; - unsigned int i, nr; - uint16_t x; - int rc, new_notls = 0; - - // Setup db info - txn->mt_numdbs = env->me_numdbs; - txn->mt_dbxs = env->me_dbxs; // mostly static anyway - - if (txn->mt_flags & MDB_TXN_RDONLY) { - if (!ti) { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; - txn->mt_txnid = meta->mm_txnid; - txn->mt_u.reader = NULL; - } else { - MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : - pthread_getspecific(env->me_txkey); - if (r) { - if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) - return MDB_BAD_RSLOT; + MDB_env *env = txn->mt_env; + MDB_txninfo *ti = env->me_txns; + MDB_meta *meta; + unsigned int i, nr; + uint16_t x; + int rc, new_notls = 0; + + // Setup db info + txn->mt_numdbs = env->me_numdbs; + txn->mt_dbxs = env->me_dbxs; // mostly static anyway + + if (txn->mt_flags & MDB_TXN_RDONLY) { + if (!ti) { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + txn->mt_u.reader = NULL; + } else { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { + MDB_PID_T pid = env->me_pid; + pthread_t tid = pthread_self(); + + if (!(env->me_flags & MDB_LIVE_READER)) { + rc = mdb_reader_pid(env, Pidset, pid); + if (rc) + return rc; + env->me_flags |= MDB_LIVE_READER; + } + + LOCK_MUTEX_R(env); + nr = ti->mti_numreaders; + for (i=0; i<nr; i++) + if (ti->mti_readers[i].mr_pid == 0) + break; + if (i == env->me_maxreaders) { + UNLOCK_MUTEX_R(env); + return MDB_READERS_FULL; + } + ti->mti_readers[i].mr_pid = pid; + ti->mti_readers[i].mr_tid = tid; + if (i == nr) + ti->mti_numreaders = ++nr; + // Save numreaders for un-mutexed mdb_env_close() + env->me_numreaders = nr; + UNLOCK_MUTEX_R(env); + + r = &ti->mti_readers[i]; + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + return rc; + } + } + txn->mt_txnid = r->mr_txnid = ti->mti_txnid; + txn->mt_u.reader = r; + meta = env->me_metas[txn->mt_txnid & 1]; + } } else { - MDB_PID_T pid = env->me_pid; - pthread_t tid = pthread_self(); + if (ti) { + LOCK_MUTEX_W(env); - if (!(env->me_flags & MDB_LIVE_READER)) { - rc = mdb_reader_pid(env, Pidset, pid); - if (rc) - return rc; - env->me_flags |= MDB_LIVE_READER; + txn->mt_txnid = ti->mti_txnid; + meta = env->me_metas[txn->mt_txnid & 1]; + } else { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; } + txn->mt_txnid++; + #if MDB_DEBUG + if (txn->mt_txnid == mdb_debug_start) + mdb_debug = 1; + #endif + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + env->me_txn = txn; + } - LOCK_MUTEX_R(env); - nr = ti->mti_numreaders; - for (i=0; i<nr; i++) - if (ti->mti_readers[i].mr_pid == 0) - break; - if (i == env->me_maxreaders) { - UNLOCK_MUTEX_R(env); - return MDB_READERS_FULL; - } - ti->mti_readers[i].mr_pid = pid; - ti->mti_readers[i].mr_tid = tid; - if (i == nr) - ti->mti_numreaders = ++nr; - // Save numreaders for un-mutexed mdb_env_close() - env->me_numreaders = nr; - UNLOCK_MUTEX_R(env); - - r = &ti->mti_readers[i]; - new_notls = (env->me_flags & MDB_NOTLS); - if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { - r->mr_pid = 0; - return rc; - } + // Copy the DB info and flags + memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); + + // Moved to here to avoid a data race in read TXNs + txn->mt_next_pgno = meta->mm_last_pg+1; + + for (i=2; i<txn->mt_numdbs; i++) { + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; } - txn->mt_txnid = r->mr_txnid = ti->mti_txnid; - txn->mt_u.reader = r; - meta = env->me_metas[txn->mt_txnid & 1]; - } - } else { - if (ti) { - LOCK_MUTEX_W(env); + txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; - txn->mt_txnid = ti->mti_txnid; - meta = env->me_metas[txn->mt_txnid & 1]; - } else { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; - txn->mt_txnid = meta->mm_txnid; - } - txn->mt_txnid++; -#if MDB_DEBUG - if (txn->mt_txnid == mdb_debug_start) - mdb_debug = 1; -#endif - txn->mt_dirty_room = MDB_IDL_UM_MAX; - txn->mt_u.dirty_list = env->me_dirty_list; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_free_pgs = env->me_free_pgs; - txn->mt_free_pgs[0] = 0; - txn->mt_spill_pgs = NULL; - env->me_txn = txn; - } - - // Copy the DB info and flags - memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); - - // Moved to here to avoid a data race in read TXNs - txn->mt_next_pgno = meta->mm_last_pg+1; - - for (i=2; i<txn->mt_numdbs; i++) { - x = env->me_dbflags[i]; - txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; - txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; - } - txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; - - if (env->me_maxpg < txn->mt_next_pgno) { - mdb_txn_reset0(txn, "renew0-mapfail"); - if (new_notls) { - txn->mt_u.reader->mr_pid = 0; - txn->mt_u.reader = NULL; - } - return MDB_MAP_RESIZED; - } + if (env->me_maxpg < txn->mt_next_pgno) { + mdb_txn_reset0(txn, "renew0-mapfail"); + if (new_notls) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } + return MDB_MAP_RESIZED; + } - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } func (t *transaction) Renew() error { /* - int rc; - - if (!txn || txn->mt_dbxs) // A reset txn has mt_dbxs==NULL - return EINVAL; - - if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - return MDB_PANIC; - } - - rc = mdb_txn_renew0(txn); - if (rc == MDB_SUCCESS) { - DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); - } - return rc; + int rc; + + if (!txn || txn->mt_dbxs) // A reset txn has mt_dbxs==NULL + return EINVAL; + + if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + + rc = mdb_txn_renew0(txn); + if (rc == MDB_SUCCESS) { + DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); + } + return rc; */ return nil } -func (t *transaction) DB() DB { +func (t *transaction) DB() *DB { return t.db } // Export or close DBI handles opened in this txn. func (t *transaction) updateBuckets(keep bool) { /* - int i; - MDB_dbi n = txn->mt_numdbs; - MDB_env *env = txn->mt_env; - unsigned char *tdbflags = txn->mt_dbflags; - - for (i = n; --i >= 2;) { - if (tdbflags[i] & DB_NEW) { - if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; - } else { - char *ptr = env->me_dbxs[i].md_name.mv_data; - env->me_dbxs[i].md_name.mv_data = NULL; - env->me_dbxs[i].md_name.mv_size = 0; - env->me_dbflags[i] = 0; - free(ptr); + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= 2;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + free(ptr); + } } } - } - if (keep && env->me_numdbs < n) - env->me_numdbs = n; + if (keep && env->me_numdbs < n) + env->me_numdbs = n; */ } @@ -426,80 +425,80 @@ func (t *transaction) updateBuckets(keep bool) { // @param[in] act why the transaction is being reset func (t *transaction) reset(act string) { /* - MDB_env *env = txn->mt_env; + MDB_env *env = txn->mt_env; - // Close any DBI handles opened in this txn - mdb_dbis_update(txn, 0); + // Close any DBI handles opened in this txn + mdb_dbis_update(txn, 0); - DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); + DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (txn->mt_u.reader) { - txn->mt_u.reader->mr_txnid = (txnid_t)-1; - if (!(env->me_flags & MDB_NOTLS)) - txn->mt_u.reader = NULL; // txn does not own reader - } - txn->mt_numdbs = 0; // close nothing if called again - txn->mt_dbxs = NULL; // mark txn as reset - } else { - mdb_cursors_close(txn, 0); + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { + txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) + txn->mt_u.reader = NULL; // txn does not own reader + } + txn->mt_numdbs = 0; // close nothing if called again + txn->mt_dbxs = NULL; // mark txn as reset + } else { + mdb_cursors_close(txn, 0); - if (!(env->me_flags & MDB_WRITEMAP)) { - mdb_dlist_free(txn); - } - mdb_midl_free(env->me_pghead); - - if (txn->mt_parent) { - txn->mt_parent->mt_child = NULL; - env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; - mdb_midl_free(txn->mt_free_pgs); - mdb_midl_free(txn->mt_spill_pgs); - free(txn->mt_u.dirty_list); - return; - } + if (!(env->me_flags & MDB_WRITEMAP)) { + mdb_dlist_free(txn); + } + mdb_midl_free(env->me_pghead); + + if (txn->mt_parent) { + txn->mt_parent->mt_child = NULL; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdb_midl_free(txn->mt_free_pgs); + mdb_midl_free(txn->mt_spill_pgs); + free(txn->mt_u.dirty_list); + return; + } - if (mdb_midl_shrink(&txn->mt_free_pgs)) - env->me_free_pgs = txn->mt_free_pgs; - env->me_pghead = NULL; - env->me_pglast = 0; + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; + env->me_pghead = NULL; + env->me_pglast = 0; - env->me_txn = NULL; - // The writer mutex was locked in mdb_txn_begin. - if (env->me_txns) - UNLOCK_MUTEX_W(env); - } + env->me_txn = NULL; + // The writer mutex was locked in mdb_txn_begin. + if (env->me_txns) + UNLOCK_MUTEX_W(env); + } */ } func (t *transaction) Reset() { /* - if (txn == NULL) - return; + if (txn == NULL) + return; - // This call is only valid for read-only txns - if (!(txn->mt_flags & MDB_TXN_RDONLY)) - return; + // This call is only valid for read-only txns + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; - mdb_txn_reset0(txn, "reset"); + mdb_txn_reset0(txn, "reset"); */ } func (t *transaction) Abort() { /* - if (txn == NULL) - return; + if (txn == NULL) + return; - if (txn->mt_child) - mdb_txn_abort(txn->mt_child); + if (txn->mt_child) + mdb_txn_abort(txn->mt_child); - mdb_txn_reset0(txn, "abort"); - // Free reader slot tied to this txn (if MDB_NOTLS && writable FS) - if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) - txn->mt_u.reader->mr_pid = 0; + mdb_txn_reset0(txn, "abort"); + // Free reader slot tied to this txn (if MDB_NOTLS && writable FS) + if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) + txn->mt_u.reader->mr_pid = 0; - free(txn); + free(txn); */ } @@ -507,154 +506,154 @@ func (t *transaction) Abort() { // This changes the freelist. Keep trying until it stabilizes. func (t *transaction) saveFreelist() error { /* - // env->me_pghead[] can grow and shrink during this call. - // env->me_pglast and txn->mt_free_pgs[] can only grow. - // Page numbers cannot disappear from txn->mt_free_pgs[]. - MDB_cursor mc; - MDB_env *env = txn->mt_env; - int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; - txnid_t pglast = 0, head_id = 0; - pgno_t freecnt = 0, *free_pgs, *mop; - ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; - - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - - if (env->me_pghead) { - // Make sure first page of freeDB is touched and on freelist - rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - return rc; - } + // env->me_pghead[] can grow and shrink during this call. + // env->me_pglast and txn->mt_free_pgs[] can only grow. + // Page numbers cannot disappear from txn->mt_free_pgs[]. + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + + if (env->me_pghead) { + // Make sure first page of freeDB is touched and on freelist + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } - // MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) - clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) - ? SSIZE_MAX : maxfree_1pg; + // MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) + clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) + ? SSIZE_MAX : maxfree_1pg; - for (;;) { - // Come back here after each Put() in case freelist changed - MDB_val key, data; - pgno_t *pgs; - ssize_t j; + for (;;) { + // Come back here after each Put() in case freelist changed + MDB_val key, data; + pgno_t *pgs; + ssize_t j; - // If using records from freeDB which we have not yet - // deleted, delete them and any we reserved for me_pghead. - while (pglast < env->me_pglast) { - rc = mdb_cursor_first(&mc, &key, NULL); - if (rc) - return rc; - pglast = head_id = *(txnid_t *)key.mv_data; - total_room = head_room = 0; - mdb_tassert(txn, pglast <= env->me_pglast); - rc = mdb_cursor_del(&mc, 0); - if (rc) - return rc; - } + // If using records from freeDB which we have not yet + // deleted, delete them and any we reserved for me_pghead. + while (pglast < env->me_pglast) { + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + return rc; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + mdb_tassert(txn, pglast <= env->me_pglast); + rc = mdb_cursor_del(&mc, 0); + if (rc) + return rc; + } - // Save the IDL of pages freed by this txn, to a single record - if (freecnt < txn->mt_free_pgs[0]) { - if (!freecnt) { - // Make sure last page of freeDB is touched and on freelist - rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - return rc; - } - free_pgs = txn->mt_free_pgs; - // Write to last page of freeDB - key.mv_size = sizeof(txn->mt_txnid); - key.mv_data = &txn->mt_txnid; - do { - freecnt = free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(free_pgs); + // Save the IDL of pages freed by this txn, to a single record + if (freecnt < txn->mt_free_pgs[0]) { + if (!freecnt) { + // Make sure last page of freeDB is touched and on freelist + rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + free_pgs = txn->mt_free_pgs; + // Write to last page of freeDB + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + // Retry if mt_free_pgs[] grew during the Put() + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + mdb_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); + #if (MDB_DEBUG) > 1 + { + unsigned int i = free_pgs[0]; + DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); + for (; i; i--) + DPRINTF(("IDL %"Z"u", free_pgs[i])); + } + #endif + continue; + } + + mop = env->me_pghead; + mop_len = mop ? mop[0] : 0; + + // Reserve records for me_pghead[]. Split it if multi-page, + // to avoid searching freeDB for a page range. Use keys in + // range [1,me_pglast]: Smaller than txnid of oldest reader. + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + // Keep current record (overflow page), add a new one + head_id--; + head_room = 0; + } + // (Re)write {key = head_id, IDL length = head_room} + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + // Overflow multi-page for part of me_pghead + head_room /= head_id; // amortize page sizes + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + // Rare case, not bothering to delete this record + head_room = 0; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) return rc; - // Retry if mt_free_pgs[] grew during the Put() - free_pgs = txn->mt_free_pgs; - } while (freecnt < free_pgs[0]); - mdb_midl_sort(free_pgs); - memcpy(data.mv_data, free_pgs, data.mv_size); -#if (MDB_DEBUG) > 1 - { - unsigned int i = free_pgs[0]; - DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); - for (; i; i--) - DPRINTF(("IDL %"Z"u", free_pgs[i])); + // IDL is initially empty, zero out at least the length + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; } -#endif - continue; - } - - mop = env->me_pghead; - mop_len = mop ? mop[0] : 0; - // Reserve records for me_pghead[]. Split it if multi-page, - // to avoid searching freeDB for a page range. Use keys in - // range [1,me_pglast]: Smaller than txnid of oldest reader. - if (total_room >= mop_len) { - if (total_room == mop_len || --more < 0) - break; - } else if (head_room >= maxfree_1pg && head_id > 1) { - // Keep current record (overflow page), add a new one - head_id--; - head_room = 0; - } - // (Re)write {key = head_id, IDL length = head_room} - total_room -= head_room; - head_room = mop_len - total_room; - if (head_room > maxfree_1pg && head_id > 1) { - // Overflow multi-page for part of me_pghead - head_room /= head_id; // amortize page sizes - head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); - } else if (head_room < 0) { - // Rare case, not bothering to delete this record - head_room = 0; - } - key.mv_size = sizeof(head_id); - key.mv_data = &head_id; - data.mv_size = (head_room + 1) * sizeof(pgno_t); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) - return rc; - // IDL is initially empty, zero out at least the length - pgs = (pgno_t *)data.mv_data; - j = head_room > clean_limit ? head_room : 0; - do { - pgs[j] = 0; - } while (--j >= 0); - total_room += head_room; - } - - // Fill in the reserved me_pghead records - rc = MDB_SUCCESS; - if (mop_len) { - MDB_val key, data; - - mop += mop_len; - rc = mdb_cursor_first(&mc, &key, &data); - for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { - unsigned flags = MDB_CURRENT; - txnid_t id = *(txnid_t *)key.mv_data; - ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; - MDB_ID save; - - mdb_tassert(txn, len >= 0 && id <= env->me_pglast); - key.mv_data = &id; - if (len > mop_len) { - len = mop_len; - data.mv_size = (len + 1) * sizeof(MDB_ID); - flags = 0; + // Fill in the reserved me_pghead records + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len; + rc = mdb_cursor_first(&mc, &key, &data); + for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { + unsigned flags = MDB_CURRENT; + txnid_t id = *(txnid_t *)key.mv_data; + ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + MDB_ID save; + + mdb_tassert(txn, len >= 0 && id <= env->me_pglast); + key.mv_data = &id; + if (len > mop_len) { + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + flags = 0; + } + data.mv_data = mop -= len; + save = mop[0]; + mop[0] = len; + rc = mdb_cursor_put(&mc, &key, &data, flags); + mop[0] = save; + if (rc || !(mop_len -= len)) + break; + } } - data.mv_data = mop -= len; - save = mop[0]; - mop[0] = len; - rc = mdb_cursor_put(&mc, &key, &data, flags); - mop[0] = save; - if (rc || !(mop_len -= len)) - break; - } - } - return rc; + return rc; */ return nil } @@ -663,342 +662,342 @@ func (t *transaction) saveFreelist() error { // @param[in] txn the transaction that's being committed // @param[in] keep number of initial pages in dirty_list to keep dirty. // @return 0 on success, non-zero on failure. -func (t *transaction) flush(keep bool) { +func (t *transaction) flush(keep bool) error { /* - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned psize = env->me_psize, j; - int i, pagecount = dl[0].mid, rc; - size_t size = 0, pos = 0; - pgno_t pgno = 0; - MDB_page *dp = NULL; -#ifdef _WIN32 - OVERLAPPED ov; -#else - struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; - size_t next_pos = 1; // impossible pos, so pos != next_pos - int n = 0; -#endif - - j = i = keep; - - if (env->me_flags & MDB_WRITEMAP) { - // Clear dirty flags - while (++i <= pagecount) { - dp = dl[i].mptr; - // Don't flush this page yet - if (dp->mp_flags & P_KEEP) { - dp->mp_flags ^= P_KEEP; - dl[++j] = dl[i]; - continue; - } - dp->mp_flags &= ~P_DIRTY; - } - goto done; - } - - // Write the pages - for (;;) { - if (++i <= pagecount) { - dp = dl[i].mptr; - // Don't flush this page yet - if (dp->mp_flags & P_KEEP) { - dp->mp_flags ^= P_KEEP; - dl[i].mid = 0; - continue; - } - pgno = dl[i].mid; - // clear dirty flag - dp->mp_flags &= ~P_DIRTY; - pos = pgno * psize; - size = psize; - if (IS_OVERFLOW(dp)) size *= dp->mp_pages; - } -#ifdef _WIN32 - else break; - - // Windows actually supports scatter/gather I/O, but only on - // unbuffered file handles. Since we're relying on the OS page - // cache for all our data, that's self-defeating. So we just - // write pages one at a time. We use the ov structure to set - // the write offset, to at least save the overhead of a Seek - // system call. - DPRINTF(("committing page %"Z"u", pgno)); - memset(&ov, 0, sizeof(ov)); - ov.Offset = pos & 0xffffffff; - ov.OffsetHigh = pos >> 16 >> 16; - if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { - rc = ErrCode(); - DPRINTF(("WriteFile: %d", rc)); - return rc; - } -#else - // Write up to MDB_COMMIT_PAGES dirty pages at a time. - if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { - if (n) { - // Write previous page(s) -#ifdef MDB_USE_PWRITEV - wres = pwritev(env->me_fd, iov, n, wpos); -#else - if (n == 1) { - wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); - } else { - if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { - rc = ErrCode(); - DPRINTF(("lseek: %s", strerror(rc))); - return rc; + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; + #ifdef _WIN32 + OVERLAPPED ov; + #else + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; // impossible pos, so pos != next_pos + int n = 0; + #endif + + j = i = keep; + + if (env->me_flags & MDB_WRITEMAP) { + // Clear dirty flags + while (++i <= pagecount) { + dp = dl[i].mptr; + // Don't flush this page yet + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[++j] = dl[i]; + continue; } - wres = writev(env->me_fd, iov, n); + dp->mp_flags &= ~P_DIRTY; } -#endif - if (wres != wsize) { - if (wres < 0) { - rc = ErrCode(); - DPRINTF(("Write error: %s", strerror(rc))); - } else { - rc = EIO; // TODO: Use which error code? - DPUTS("short write, filesystem full?"); + goto done; + } + + // Write the pages + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + // Don't flush this page yet + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[i].mid = 0; + continue; } + pgno = dl[i].mid; + // clear dirty flag + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) size *= dp->mp_pages; + } + #ifdef _WIN32 + else break; + + // Windows actually supports scatter/gather I/O, but only on + // unbuffered file handles. Since we're relying on the OS page + // cache for all our data, that's self-defeating. So we just + // write pages one at a time. We use the ov structure to set + // the write offset, to at least save the overhead of a Seek + // system call. + DPRINTF(("committing page %"Z"u", pgno)); + memset(&ov, 0, sizeof(ov)); + ov.Offset = pos & 0xffffffff; + ov.OffsetHigh = pos >> 16 >> 16; + if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { + rc = ErrCode(); + DPRINTF(("WriteFile: %d", rc)); return rc; } - n = 0; - } - if (i > pagecount) - break; - wpos = pos; - wsize = 0; - } - DPRINTF(("committing page %"Z"u", pgno)); - next_pos = pos + size; - iov[n].iov_len = size; - iov[n].iov_base = (char *)dp; - wsize += size; - n++; -#endif // _WIN32 - } - - for (i = keep; ++i <= pagecount; ) { - dp = dl[i].mptr; - // This is a page we skipped above - if (!dl[i].mid) { - dl[++j] = dl[i]; - dl[j].mid = dp->mp_pgno; - continue; - } - mdb_dpage_free(env, dp); - } - -done: - i--; - txn->mt_dirty_room += i - j; - dl[0].mid = j; - return MDB_SUCCESS; -} - -int -mdb_txn_commit(MDB_txn *txn) -{ - int rc; - unsigned int i; - MDB_env *env; - - if (txn == NULL || txn->mt_env == NULL) - return EINVAL; - - if (txn->mt_child) { - rc = mdb_txn_commit(txn->mt_child); - txn->mt_child = NULL; - if (rc) - goto fail; - } - - env = txn->mt_env; - - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - mdb_dbis_update(txn, 1); - txn->mt_numdbs = 2; // so txn_abort() doesn't close any new handles - mdb_txn_abort(txn); - return MDB_SUCCESS; - } - - if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { - DPUTS("error flag is set, can't commit"); - if (txn->mt_parent) - txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - rc = MDB_BAD_TXN; - goto fail; - } - - if (txn->mt_parent) { - MDB_txn *parent = txn->mt_parent; - MDB_ID2L dst, src; - MDB_IDL pspill; - unsigned x, y, len, ps_len; - - // Append our free list to parent's - rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); - if (rc) - goto fail; - mdb_midl_free(txn->mt_free_pgs); - // Failures after this must either undo the changes - // to the parent or set MDB_TXN_ERROR in the parent. - - parent->mt_next_pgno = txn->mt_next_pgno; - parent->mt_flags = txn->mt_flags; - - // Merge our cursors into parent's and close them - mdb_cursors_close(txn, 1); - - // Update parent's DB table. - memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - parent->mt_numdbs = txn->mt_numdbs; - parent->mt_dbflags[0] = txn->mt_dbflags[0]; - parent->mt_dbflags[1] = txn->mt_dbflags[1]; - for (i=2; i<txn->mt_numdbs; i++) { - // preserve parent's DB_NEW status - x = parent->mt_dbflags[i] & DB_NEW; - parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; - } - - dst = parent->mt_u.dirty_list; - src = txn->mt_u.dirty_list; - // Remove anything in our dirty list from parent's spill list - if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { - x = y = ps_len; - pspill[0] = (pgno_t)-1; - // Mark our dirty pages as deleted in parent spill list - for (i=0, len=src[0].mid; ++i <= len; ) { - MDB_ID pn = src[i].mid << 1; - while (pn > pspill[x]) - x--; - if (pn == pspill[x]) { - pspill[x] = 1; - y = --x; + #else + // Write up to MDB_COMMIT_PAGES dirty pages at a time. + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { + if (n) { + // Write previous page(s) + #ifdef MDB_USE_PWRITEV + wres = pwritev(env->me_fd, iov, n, wpos); + #else + if (n == 1) { + wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + } else { + if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { + rc = ErrCode(); + DPRINTF(("lseek: %s", strerror(rc))); + return rc; + } + wres = writev(env->me_fd, iov, n); + } + #endif + if (wres != wsize) { + if (wres < 0) { + rc = ErrCode(); + DPRINTF(("Write error: %s", strerror(rc))); + } else { + rc = EIO; // TODO: Use which error code? + DPUTS("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; } + DPRINTF(("committing page %"Z"u", pgno)); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; + #endif // _WIN32 } - // Squash deleted pagenums if we deleted any - for (x=y; ++x <= ps_len; ) - if (!(pspill[x] & 1)) - pspill[++y] = pspill[x]; - pspill[0] = y; - } - // Find len = length of merging our dirty list with parent's - x = dst[0].mid; - dst[0].mid = 0; // simplify loops - if (parent->mt_parent) { - len = x + src[0].mid; - y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; - for (i = x; y && i; y--) { - pgno_t yp = src[y].mid; - while (yp < dst[i].mid) - i--; - if (yp == dst[i].mid) { - i--; - len--; + for (i = keep; ++i <= pagecount; ) { + dp = dl[i].mptr; + // This is a page we skipped above + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; } + mdb_dpage_free(env, dp); } - } else { // Simplify the above for single-ancestor case - len = MDB_IDL_UM_MAX - txn->mt_dirty_room; - } - // Merge our dirty list with parent's - y = src[0].mid; - for (i = len; y; dst[i--] = src[y--]) { - pgno_t yp = src[y].mid; - while (yp < dst[x].mid) - dst[i--] = dst[x--]; - if (yp == dst[x].mid) - free(dst[x--].mptr); - } - mdb_tassert(txn, i == x); - dst[0].mid = len; - free(txn->mt_u.dirty_list); - parent->mt_dirty_room = txn->mt_dirty_room; - if (txn->mt_spill_pgs) { - if (parent->mt_spill_pgs) { - // TODO: Prevent failure here, so parent does not fail - rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); - if (rc) - parent->mt_flags |= MDB_TXN_ERROR; - mdb_midl_free(txn->mt_spill_pgs); - mdb_midl_sort(parent->mt_spill_pgs); - } else { - parent->mt_spill_pgs = txn->mt_spill_pgs; - } + + done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; + return MDB_SUCCESS; } - parent->mt_child = NULL; - mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); - free(txn); - return rc; - } + int + mdb_txn_commit(MDB_txn *txn) + { + int rc; + unsigned int i; + MDB_env *env; - if (txn != env->me_txn) { - DPUTS("attempt to commit unknown transaction"); - rc = EINVAL; - goto fail; - } + if (txn == NULL || txn->mt_env == NULL) + return EINVAL; - mdb_cursors_close(txn, 0); + if (txn->mt_child) { + rc = mdb_txn_commit(txn->mt_child); + txn->mt_child = NULL; + if (rc) + goto fail; + } - if (!txn->mt_u.dirty_list[0].mid && - !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) - goto done; + env = txn->mt_env; - DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", - txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + mdb_dbis_update(txn, 1); + txn->mt_numdbs = 2; // so txn_abort() doesn't close any new handles + mdb_txn_abort(txn); + return MDB_SUCCESS; + } - // Update DB root pointers - if (txn->mt_numdbs > 2) { - MDB_cursor mc; - MDB_dbi i; - MDB_val data; - data.mv_size = sizeof(MDB_db); + if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("error flag is set, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - for (i = 2; i < txn->mt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - data.mv_data = &txn->mt_dbs[i]; - rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (txn->mt_parent) { + MDB_txn *parent = txn->mt_parent; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; + + // Append our free list to parent's + rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); if (rc) goto fail; + mdb_midl_free(txn->mt_free_pgs); + // Failures after this must either undo the changes + // to the parent or set MDB_TXN_ERROR in the parent. + + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + // Merge our cursors into parent's and close them + mdb_cursors_close(txn, 1); + + // Update parent's DB table. + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[0] = txn->mt_dbflags[0]; + parent->mt_dbflags[1] = txn->mt_dbflags[1]; + for (i=2; i<txn->mt_numdbs; i++) { + // preserve parent's DB_NEW status + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + // Remove anything in our dirty list from parent's spill list + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + // Mark our dirty pages as deleted in parent spill list + for (i=0, len=src[0].mid; ++i <= len; ) { + MDB_ID pn = src[i].mid << 1; + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + // Squash deleted pagenums if we deleted any + for (x=y; ++x <= ps_len; ) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + // Find len = length of merging our dirty list with parent's + x = dst[0].mid; + dst[0].mid = 0; // simplify loops + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { // Simplify the above for single-ancestor case + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + // Merge our dirty list with parent's + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdb_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + // TODO: Prevent failure here, so parent does not fail + rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (rc) + parent->mt_flags |= MDB_TXN_ERROR; + mdb_midl_free(txn->mt_spill_pgs); + mdb_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + parent->mt_child = NULL; + mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + free(txn); + return rc; + } + + if (txn != env->me_txn) { + DPUTS("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdb_cursors_close(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) + goto done; + + DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); + + // Update DB root pointers + if (txn->mt_numdbs > 2) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = 2; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + data.mv_data = &txn->mt_dbs[i]; + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (rc) + goto fail; + } + } } - } - } - rc = mdb_freelist_save(txn); - if (rc) - goto fail; + rc = mdb_freelist_save(txn); + if (rc) + goto fail; - mdb_midl_free(env->me_pghead); - env->me_pghead = NULL; - if (mdb_midl_shrink(&txn->mt_free_pgs)) - env->me_free_pgs = txn->mt_free_pgs; + mdb_midl_free(env->me_pghead); + env->me_pghead = NULL; + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; -#if (MDB_DEBUG) > 2 - mdb_audit(txn); -#endif + #if (MDB_DEBUG) > 2 + mdb_audit(txn); + #endif - if ((rc = mdb_page_flush(txn, 0)) || - (rc = mdb_env_sync(env, 0)) || - (rc = mdb_env_write_meta(txn))) - goto fail; + if ((rc = mdb_page_flush(txn, 0)) || + (rc = mdb_env_sync(env, 0)) || + (rc = mdb_env_write_meta(txn))) + goto fail; -done: - env->me_pglast = 0; - env->me_txn = NULL; - mdb_dbis_update(txn, 1); + done: + env->me_pglast = 0; + env->me_txn = NULL; + mdb_dbis_update(txn, 1); - if (env->me_txns) - UNLOCK_MUTEX_W(env); - free(txn); + if (env->me_txns) + UNLOCK_MUTEX_W(env); + free(txn); - return MDB_SUCCESS; + return MDB_SUCCESS; -fail: - mdb_txn_abort(txn); - return rc; + fail: + mdb_txn_abort(txn); + return rc; */ return nil } @@ -1008,118 +1007,118 @@ fail: // @return 0 on success, non-zero on failure. func (t *transaction) writeMeta() error { /* - MDB_env *env; - MDB_meta meta, metab, *mp; - off_t off; - int rc, len, toggle; - char *ptr; - HANDLE mfd; -#ifdef _WIN32 - OVERLAPPED ov; -#else - int r2; -#endif - - toggle = txn->mt_txnid & 1; - DPRINTF(("writing meta page %d for root page %"Z"u", - toggle, txn->mt_dbs[MAIN_DBI].md_root)); - - env = txn->mt_env; - mp = env->me_metas[toggle]; - - if (env->me_flags & MDB_WRITEMAP) { - // Persist any increases of mapsize config - if (env->me_mapsize > mp->mm_mapsize) - mp->mm_mapsize = env->me_mapsize; - mp->mm_dbs[0] = txn->mt_dbs[0]; - mp->mm_dbs[1] = txn->mt_dbs[1]; - mp->mm_last_pg = txn->mt_next_pgno - 1; - mp->mm_txnid = txn->mt_txnid; - if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { - unsigned meta_size = env->me_psize; - rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - ptr = env->me_map; - if (toggle) { -#ifndef _WIN32 // POSIX msync() requires ptr = start of OS page - if (meta_size < env->me_os_psize) - meta_size += meta_size; - else -#endif - ptr += meta_size; + MDB_env *env; + MDB_meta meta, metab, *mp; + off_t off; + int rc, len, toggle; + char *ptr; + HANDLE mfd; + #ifdef _WIN32 + OVERLAPPED ov; + #else + int r2; + #endif + + toggle = txn->mt_txnid & 1; + DPRINTF(("writing meta page %d for root page %"Z"u", + toggle, txn->mt_dbs[MAIN_DBI].md_root)); + + env = txn->mt_env; + mp = env->me_metas[toggle]; + + if (env->me_flags & MDB_WRITEMAP) { + // Persist any increases of mapsize config + if (env->me_mapsize > mp->mm_mapsize) + mp->mm_mapsize = env->me_mapsize; + mp->mm_dbs[0] = txn->mt_dbs[0]; + mp->mm_dbs[1] = txn->mt_dbs[1]; + mp->mm_last_pg = txn->mt_next_pgno - 1; + mp->mm_txnid = txn->mt_txnid; + if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; + rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + ptr = env->me_map; + if (toggle) { + #ifndef _WIN32 // POSIX msync() requires ptr = start of OS page + if (meta_size < env->me_os_psize) + meta_size += meta_size; + else + #endif + ptr += meta_size; + } + if (MDB_MSYNC(ptr, meta_size, rc)) { + rc = ErrCode(); + goto fail; + } + } + goto done; } - if (MDB_MSYNC(ptr, meta_size, rc)) { - rc = ErrCode(); - goto fail; + metab.mm_txnid = env->me_metas[toggle]->mm_txnid; + metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + + ptr = (char *)&meta; + if (env->me_mapsize > mp->mm_mapsize) { + // Persist any increases of mapsize config + meta.mm_mapsize = env->me_mapsize; + off = offsetof(MDB_meta, mm_mapsize); + } else { + off = offsetof(MDB_meta, mm_dbs[0].md_depth); } - } - goto done; - } - metab.mm_txnid = env->me_metas[toggle]->mm_txnid; - metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; - - ptr = (char *)&meta; - if (env->me_mapsize > mp->mm_mapsize) { - // Persist any increases of mapsize config - meta.mm_mapsize = env->me_mapsize; - off = offsetof(MDB_meta, mm_mapsize); - } else { - off = offsetof(MDB_meta, mm_dbs[0].md_depth); - } - len = sizeof(MDB_meta) - off; - - ptr += off; - meta.mm_dbs[0] = txn->mt_dbs[0]; - meta.mm_dbs[1] = txn->mt_dbs[1]; - meta.mm_last_pg = txn->mt_next_pgno - 1; - meta.mm_txnid = txn->mt_txnid; - - if (toggle) - off += env->me_psize; - off += PAGEHDRSZ; - - // Write to the SYNC fd - mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? - env->me_fd : env->me_mfd; -#ifdef _WIN32 - { - memset(&ov, 0, sizeof(ov)); - ov.Offset = off; - if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) - rc = -1; - } -#else - rc = pwrite(mfd, ptr, len, off); -#endif - if (rc != len) { - rc = rc < 0 ? ErrCode() : EIO; - DPUTS("write failed, disk error?"); - // On a failure, the pagecache still contains the new data. - // Write some old data back, to prevent it from being used. - // Use the non-SYNC fd; we know it will fail anyway. - meta.mm_last_pg = metab.mm_last_pg; - meta.mm_txnid = metab.mm_txnid; -#ifdef _WIN32 - memset(&ov, 0, sizeof(ov)); - ov.Offset = off; - WriteFile(env->me_fd, ptr, len, NULL, &ov); -#else - r2 = pwrite(env->me_fd, ptr, len, off); - (void)r2; // Silence warnings. We don't care about pwrite's return value -#endif -fail: - env->me_flags |= MDB_FATAL_ERROR; - return rc; - } -done: - // Memory ordering issues are irrelevant; since the entire writer - // is wrapped by wmutex, all of these changes will become visible - // after the wmutex is unlocked. Since the DB is multi-version, - // readers will get consistent data regardless of how fresh or - // how stale their view of these values is. - if (env->me_txns) - env->me_txns->mti_txnid = txn->mt_txnid; - - return MDB_SUCCESS; + len = sizeof(MDB_meta) - off; + + ptr += off; + meta.mm_dbs[0] = txn->mt_dbs[0]; + meta.mm_dbs[1] = txn->mt_dbs[1]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; + + if (toggle) + off += env->me_psize; + off += PAGEHDRSZ; + + // Write to the SYNC fd + mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? + env->me_fd : env->me_mfd; + #ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; + } + #else + rc = pwrite(mfd, ptr, len, off); + #endif + if (rc != len) { + rc = rc < 0 ? ErrCode() : EIO; + DPUTS("write failed, disk error?"); + // On a failure, the pagecache still contains the new data. + // Write some old data back, to prevent it from being used. + // Use the non-SYNC fd; we know it will fail anyway. + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; + #ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_fd, ptr, len, NULL, &ov); + #else + r2 = pwrite(env->me_fd, ptr, len, off); + (void)r2; // Silence warnings. We don't care about pwrite's return value + #endif + fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; + } + done: + // Memory ordering issues are irrelevant; since the entire writer + // is wrapped by wmutex, all of these changes will become visible + // after the wmutex is unlocked. Since the DB is multi-version, + // readers will get consistent data regardless of how fresh or + // how stale their view of these values is. + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; + + return MDB_SUCCESS; */ return nil } @@ -1132,53 +1131,53 @@ done: // @return 0 on success, non-zero on failure. func (t *transaction) getPage(id int) (*page, int, error) { /* - MDB_env *env = txn->mt_env; - MDB_page *p = NULL; - int level; - - if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { - MDB_txn *tx2 = txn; - level = 1; - do { - MDB_ID2L dl = tx2->mt_u.dirty_list; - unsigned x; - // Spilled pages were dirtied in this txn and flushed - // because the dirty list got full. Bring this page - // back in from the map (but don't unspill it here, - // leave that unless page_touch happens again). - if (tx2->mt_spill_pgs) { - MDB_ID pn = pgno << 1; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - goto done; - } + MDB_env *env = txn->mt_env; + MDB_page *p = NULL; + int level; + + if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { + MDB_txn *tx2 = txn; + level = 1; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + // Spilled pages were dirtied in this txn and flushed + // because the dirty list got full. Bring this page + // back in from the map (but don't unspill it here, + // leave that unless page_touch happens again). + if (tx2->mt_spill_pgs) { + MDB_ID pn = pgno << 1; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + goto done; + } + } + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + level++; + } while ((tx2 = tx2->mt_parent) != NULL); } - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - p = dl[x].mptr; - goto done; - } + + if (pgno < txn->mt_next_pgno) { + level = 0; + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + DPRINTF(("page %"Z"u not found", pgno)); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; } - level++; - } while ((tx2 = tx2->mt_parent) != NULL); - } - - if (pgno < txn->mt_next_pgno) { - level = 0; - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - } else { - DPRINTF(("page %"Z"u not found", pgno)); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_NOTFOUND; - } - -done: - *ret = p; - if (lvl) - *lvl = level; - return MDB_SUCCESS; + + done: + *ret = p; + if (lvl) + *lvl = level; + return MDB_SUCCESS; */ return nil, 0, nil @@ -1191,349 +1190,354 @@ done: // @return 0 on success, non-zero on failure. func (t *transaction) readNode(leaf *node, data []byte) error { /* - MDB_page *omp; // overflow page - pgno_t pgno; - int rc; + MDB_page *omp; // overflow page + pgno_t pgno; + int rc; - if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { - data->mv_size = NODEDSZ(leaf); - data->mv_data = NODEDATA(leaf); - return MDB_SUCCESS; - } + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = NODEDSZ(leaf); + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } - // Read overflow data. - data->mv_size = NODEDSZ(leaf); - memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { - DPRINTF(("read overflow page %"Z"u failed", pgno)); - return rc; - } - data->mv_data = METADATA(omp); + // Read overflow data. + data->mv_size = NODEDSZ(leaf); + memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); + if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { + DPRINTF(("read overflow page %"Z"u failed", pgno)); + return rc; + } + data->mv_data = METADATA(omp); - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } func (t *transaction) Get(bucket Bucket, key []byte) ([]byte, error) { /* - MDB_cursor mc; - MDB_xcursor mx; - int exact = 0; - DKBUF; + MDB_cursor mc; + MDB_xcursor mx; + int exact = 0; + DKBUF; - if (key == NULL || data == NULL) - return EINVAL; + if (key == NULL || data == NULL) + return EINVAL; - DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); + DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); */ return nil, nil } func (t *transaction) Cursor(b Bucket) (Cursor, error) { /* - MDB_cursor *mc; - size_t size = sizeof(MDB_cursor); + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); - if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; - // Allow read access to the freelist - if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EINVAL; + // Allow read access to the freelist + if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EINVAL; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) - size += sizeof(MDB_xcursor); + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); - if ((mc = malloc(size)) != NULL) { - mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); - if (txn->mt_cursors) { - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; - mc->mc_flags |= C_UNTRACK; + if ((mc = malloc(size)) != NULL) { + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; } - } else { - return ENOMEM; - } - *ret = mc; + *ret = mc; - return MDB_SUCCESS; + return MDB_SUCCESS; */ - return nil + return nil, nil } -func (t *transaction) Renew(c Cursor) error { +func (t *transaction) Renew1(c Cursor) error { /* - if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) - return EINVAL; + if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) + return EINVAL; - if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) - return EINVAL; + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) + return EINVAL; - mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); - return MDB_SUCCESS; + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; */ + return nil } -func (t *transaction) Delete(b *bucket, key []byte, data []byte) error { +func (t *transaction) Delete(b *Bucket, key []byte, data []byte) error { /* - MDB_cursor mc; - MDB_xcursor mx; - MDB_cursor_op op; - MDB_val rdata, *xdata; - int rc, exact; - DKBUF; - - if (key == NULL) - return EINVAL; - - DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); - - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; - - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - mdb_cursor_init(&mc, txn, dbi, &mx); - - exact = 0; - if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { - // must ignore any data - data = NULL; - } - if (data) { - op = MDB_GET_BOTH; - rdata = *data; - xdata = &rdata; - } else { - op = MDB_SET; - xdata = NULL; - } - rc = mdb_cursor_set(&mc, key, xdata, op, &exact); - if (rc == 0) { - // let mdb_page_split know about this cursor if needed: - // delete will trigger a rebalance; if it needs to move - // a node from one page to another, it will have to - // update the parent's separator key(s). If the new sepkey - // is larger than the current one, the parent page may - // run out of space, triggering a split. We need this - // cursor to be consistent until the end of the rebalance. - mc.mc_flags |= C_UNTRACK; - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); - txn->mt_cursors[dbi] = mc.mc_next; - } - return rc; + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata, *xdata; + int rc, exact; + DKBUF; + + if (key == NULL) + return EINVAL; + + DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); + + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + + exact = 0; + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + // must ignore any data + data = NULL; + } + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + xdata = &rdata; + } else { + op = MDB_SET; + xdata = NULL; + } + rc = mdb_cursor_set(&mc, key, xdata, op, &exact); + if (rc == 0) { + // let mdb_page_split know about this cursor if needed: + // delete will trigger a rebalance; if it needs to move + // a node from one page to another, it will have to + // update the parent's separator key(s). If the new sepkey + // is larger than the current one, the parent page may + // run out of space, triggering a split. We need this + // cursor to be consistent until the end of the rebalance. + mc.mc_flags |= C_UNTRACK; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; */ return nil } func (t *transaction) Put(b Bucket, key []byte, data []byte, flags int) error { /* - MDB_cursor mc; - MDB_xcursor mx; + MDB_cursor mc; + MDB_xcursor mx; - if (key == NULL || data == NULL) - return EINVAL; + if (key == NULL || data == NULL) + return EINVAL; - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) - return EINVAL; + if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) + return EINVAL; - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_put(&mc, key, data, flags); + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_put(&mc, key, data, flags); */ + return nil } -func (t *transaction) Bucket(name string, flags int) (Bucket, error) { +func (t *transaction) Bucket(name string, flags int) (*Bucket, error) { /* - MDB_val key, data; - MDB_dbi i; - MDB_cursor mc; - int rc, dbflag, exact; - unsigned int unused = 0; - size_t len; - - if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, FREE_DBI); - } - - if ((flags & VALID_FLAGS) != flags) - return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; - - // main DB? - if (!name) { - *dbi = MAIN_DBI; - if (flags & PERSISTENT_FLAGS) { - uint16_t f2 = flags & PERSISTENT_FLAGS; - // make sure flag changes get committed - if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { - txn->mt_dbs[MAIN_DBI].md_flags |= f2; - txn->mt_flags |= MDB_TXN_DIRTY; + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + int rc, dbflag, exact; + unsigned int unused = 0; + size_t len; + + if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, FREE_DBI); + } + + if ((flags & VALID_FLAGS) != flags) + return EINVAL; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + // main DB? + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + // make sure flag changes get committed + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } } + mdb_default_cmp(txn, MAIN_DBI); + return MDB_SUCCESS; } - mdb_default_cmp(txn, MAIN_DBI); - return MDB_SUCCESS; - } - - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, MAIN_DBI); - } - - // Is the DB already open? - len = strlen(name); - for (i=2; i<txn->mt_numdbs; i++) { - if (!txn->mt_dbxs[i].md_name.mv_size) { - // Remember this free slot - if (!unused) unused = i; - continue; + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, MAIN_DBI); } - if (len == txn->mt_dbxs[i].md_name.mv_size && - !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { - *dbi = i; - return MDB_SUCCESS; + + // Is the DB already open? + len = strlen(name); + for (i=2; i<txn->mt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + // Remember this free slot + if (!unused) unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } } - } - - // If no free slot and max hit, fail - if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) - return MDB_DBS_FULL; - - // Cannot mix named databases with some mainDB flags - if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) - return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; - - // Find the DB info - dbflag = DB_NEW|DB_VALID; - exact = 0; - key.mv_size = len; - key.mv_data = (void *)name; - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); - if (rc == MDB_SUCCESS) { - // make sure this is actually a DB - MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (!(node->mn_flags & F_SUBDATA)) - return MDB_INCOMPATIBLE; - } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { - // Create if requested - MDB_db dummy; - data.mv_size = sizeof(MDB_db); - data.mv_data = &dummy; - memset(&dummy, 0, sizeof(dummy)); - dummy.md_root = P_INVALID; - dummy.md_flags = flags & PERSISTENT_FLAGS; - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); - dbflag |= DB_DIRTY; - } - - // OK, got info, add to table - if (rc == MDB_SUCCESS) { - unsigned int slot = unused ? unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = strdup(name); - txn->mt_dbxs[slot].md_name.mv_size = len; - txn->mt_dbxs[slot].md_rel = NULL; - txn->mt_dbflags[slot] = dbflag; - memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); - *dbi = slot; - mdb_default_cmp(txn, slot); - if (!unused) { - txn->mt_numdbs++; + + // If no free slot and max hit, fail + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) + return MDB_DBS_FULL; + + // Cannot mix named databases with some mainDB flags + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + + // Find the DB info + dbflag = DB_NEW|DB_VALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (rc == MDB_SUCCESS) { + // make sure this is actually a DB + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (!(node->mn_flags & F_SUBDATA)) + return MDB_INCOMPATIBLE; + } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { + // Create if requested + MDB_db dummy; + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + dbflag |= DB_DIRTY; } - } - return rc; + // OK, got info, add to table + if (rc == MDB_SUCCESS) { + unsigned int slot = unused ? unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = strdup(name); + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdb_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; */ return nil, nil } func (t *transaction) Stat(b Bucket) *Stat { - if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) - return EINVAL; - - if (txn->mt_dbflags[dbi] & DB_STALE) { - MDB_cursor mc; - MDB_xcursor mx; - /* Stale, must read the DB's root. cursor_init does it for us. */ - mdb_cursor_init(&mc, txn, dbi, &mx); - } - return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); + /* + if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) + return EINVAL; + + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + // Stale, must read the DB's root. cursor_init does it for us. + mdb_cursor_init(&mc, txn, dbi, &mx); + } + return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); + */ + return nil } func (t *transaction) BucketFlags(b Bucket) (int, error) { /* - // We could return the flags for the FREE_DBI too but what's the point? - if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) - return EINVAL; - *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; - return MDB_SUCCESS; + // We could return the flags for the FREE_DBI too but what's the point? + if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) + return EINVAL; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; */ return 0, nil } -func (t *transaction) Drop(b Bucket int del) error { +func (t *transaction) Drop(b *Bucket, del int) error { /* - MDB_cursor *mc, *m2; - int rc; + MDB_cursor *mc, *m2; + int rc; - if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EACCES; + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) - return rc; + rc = mdb_cursor_open(txn, dbi, &mc); + if (rc) + return rc; - rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); - // Invalidate the dropped DB's cursors - for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) - m2->mc_flags &= ~(C_INITIALIZED|C_EOF); - if (rc) - goto leave; - - // Can't delete the main DB - if (del && dbi > MAIN_DBI) { - rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); - if (!rc) { - txn->mt_dbflags[dbi] = DB_STALE; - mdb_dbi_close(txn->mt_env, dbi); - } - } else { - // reset the DB record, mark it dirty - txn->mt_dbflags[dbi] |= DB_DIRTY; - txn->mt_dbs[dbi].md_depth = 0; - txn->mt_dbs[dbi].md_branch_pages = 0; - txn->mt_dbs[dbi].md_leaf_pages = 0; - txn->mt_dbs[dbi].md_overflow_pages = 0; - txn->mt_dbs[dbi].md_entries = 0; - txn->mt_dbs[dbi].md_root = P_INVALID; - - txn->mt_flags |= MDB_TXN_DIRTY; - } -leave: - mdb_cursor_close(mc); - return rc; + rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + // Invalidate the dropped DB's cursors + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + if (rc) + goto leave; + + // Can't delete the main DB + if (del && dbi > MAIN_DBI) { + rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } + } else { + // reset the DB record, mark it dirty + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + + txn->mt_flags |= MDB_TXN_DIRTY; + } + leave: + mdb_cursor_close(mc); + return rc; */ return nil } |