pax_global_header00006660000000000000000000000064137217311650014520gustar00rootroot0000000000000052 comment=d5a25b83fbf4f3f61ff03a9202e36f5b75544426
badger-2.2007.2/000077500000000000000000000000001372173116500131765ustar00rootroot00000000000000badger-2.2007.2/.deepsource.toml000066400000000000000000000003251372173116500163070ustar00rootroot00000000000000version = 1

test_patterns = [
  'integration/testgc/**',
  '**/*_test.go'
]

exclude_patterns = [ ]

[[analyzers]]
name = 'go'
enabled = true

[analyzers.meta]
import_path = 'github.com/dgraph-io/badger'

badger-2.2007.2/.github/000077500000000000000000000000001372173116500145365ustar00rootroot00000000000000badger-2.2007.2/.github/CODEOWNERS000066400000000000000000000003211372173116500161250ustar00rootroot00000000000000# CODEOWNERS info: https://help.github.com/en/articles/about-code-owners
# Owners are automatically requested for review for PRs that change code
# that they own.

* @manishrjain @ashish-goswami @jarifibrahim

badger-2.2007.2/.github/ISSUE_TEMPLATE000066400000000000000000000011671372173116500166510ustar00rootroot00000000000000
### What version of Go are you using (`go version`)?

<pre>
$ go version

### What operating system are you using? ### What version of Badger are you using? ### Does this issue reproduce with the latest master? ### Steps to Reproduce the issue ### What Badger options were set? ### What did you do? ### What did you expect to see? ### What did you see instead? badger-2.2007.2/.github/stale.yml000066400000000000000000000015261372173116500163750ustar00rootroot00000000000000# Number of days of inactivity before an issue becomes stale daysUntilStale: 30 # Number of days of inactivity before a stale issue is closed daysUntilClose: 7 # Issues with these labels will never be considered stale exemptLabels: - skip/stale - status/accepted # Label to use when marking an issue as stale staleLabel: status/stale # Comment to post when marking an issue as stale. Set to `false` to disable markComment: > This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. # Comment to post when closing a stale issue. Set to `false` to disable closeComment: > This issue was marked as stale and no activity has occurred since then, therefore it will now be closed. Please, reopen if the issue is still relevant. badger-2.2007.2/.gitignore000066400000000000000000000000211372173116500151570ustar00rootroot00000000000000p/ badger-test*/ badger-2.2007.2/.golangci.yml000066400000000000000000000005041372173116500155610ustar00rootroot00000000000000run: tests: false linters-settings: lll: line-length: 100 linters: disable-all: true enable: - errcheck - ineffassign - gas - gofmt - golint - gosimple - govet - lll - varcheck - unused issues: exclude-rules: - linters: - gosec text: "G404: " badger-2.2007.2/.travis.yml000066400000000000000000000045141372173116500153130ustar00rootroot00000000000000language: go go: - "1.12" - "1.13" - tip os: - osx env: jobs: - GOARCH=386 - GOARCH=amd64 global: - secure: CRkV2+/jlO0gXzzS50XGxfMS117FNwiVjxNY/LeWq06RKD+dDCPxTJl3JCNe3l0cYEPAglV2uMMYukDiTqJ7e+HI4nh4N4mv6lwx39N8dAvJe1x5ITS2T4qk4kTjuQb1Q1vw/ZOxoQqmvNKj2uRmBdJ/HHmysbRJ1OzCWML3OXdUwJf0AYlJzTjpMfkOKr7sTtE4rwyyQtd4tKH1fGdurgI9ZuFd9qvYxK2qcJhsQ6CNqMXt+7FkVkN1rIPmofjjBTNryzUr4COFXuWH95aDAif19DeBW4lbNgo1+FpDsrgmqtuhl6NAuptI8q/imow2KXBYJ8JPXsxW8DVFj0IIp0RCd3GjaEnwBEbxAyiIHLfW7AudyTS/dJOvZffPqXnuJ8xj3OPIdNe4xY0hWl8Ju2HhKfLOAHq7VadHZWd3IHLil70EiL4/JLD1rNbMImUZisFaA8pyrcIvYYebjOnk4TscwKFLedClRSX1XsMjWWd0oykQtrdkHM2IxknnBpaLu7mFnfE07f6dkG0nlpyu4SCLey7hr5FdcEmljA0nIxTSYDg6035fQkBEAbe7hlESOekkVNT9IZPwG+lmt3vU4ofi6NqNbJecOuSB+h36IiZ9s4YQtxYNnLgW14zjuFGGyT5smc3IjBT7qngDjKIgyrSVoRkY/8udy9qbUgvBeW8= jobs: allow_failures: - go: tip exclude: # Exclude builds for 386 architecture on go 1.12 and tip # Since we don't want it to run for 32 bit - go: "1.12" env: GOARCH=386 - go: tip env: GOARCH=386 include: # Define one extra linux build, which we use to run cross # compiled 32 bit tests - os: linux arch: arm64 go: "1.14" env: go_32=yes notifications: email: false slack: secure: 
X7uBLWYbuUhf8QFE16CoS5z7WvFR8EN9j6cEectMW6mKZ3vwXGwVXRIPsgUq/606DsQdCCx34MR8MRWYGlu6TBolbSe9y0EP0i46yipPz22YtuT7umcVUbGEyx8MZKgG0v1u/zA0O4aCsOBpGAA3gxz8h3JlEHDt+hv6U8xRsSllVLzLSNb5lwxDtcfEDxVVqP47GMEgjLPM28Pyt5qwjk7o5a4YSVzkfdxBXxd3gWzFUWzJ5E3cTacli50dK4GVfiLcQY2aQYoYO7AAvDnvP+TPfjDkBlUEE4MUz5CDIN51Xb+WW33sX7g+r3Bj7V5IRcF973RiYkpEh+3eoiPnyWyxhDZBYilty3b+Hysp6d4Ov/3I3ll7Bcny5+cYjakjkMH3l9w3gs6Y82GlpSLSJshKWS8vPRsxFe0Pstj6QSJXTd9EBaFr+l1ScXjJv/Sya9j8N9FfTuOTESWuaL1auX4Y7zEEVHlA8SCNOO8K0eTfxGZnC/YcIHsR8rePEAcFxfOYQppkyLF/XvAtnb/LMUuu0g4y2qNdme6Oelvyar1tFEMRtbl4mRCdu/krXBFtkrsfUaVY6WTPdvXAGotsFJ0wuA53zGVhlcd3+xAlSlR3c1QX95HIMeivJKb5L4nTjP+xnrmQNtnVk+tG4LSH2ltuwcZSSczModtcBmRefrk= script: >- if [ $TRAVIS_OS_NAME = "linux" ] && [ $go_32 ]; then uname -a GOOS=linux GOARCH=arm go test -v ./... # Another round of tests after turning off mmap. GOOS=linux GOARCH=arm go test -v -vlog_mmap=false github.com/dgraph-io/badger else go test -v ./... # Another round of tests after turning off mmap. go test -v -vlog_mmap=false github.com/dgraph-io/badger fi badger-2.2007.2/CHANGELOG.md000066400000000000000000000362601372173116500150160ustar00rootroot00000000000000# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [2.2007.2] - 2020-08-31 ### Fixed - Compaction: Use separate compactors for L0, L1 (#1466) - Rework Block and Index cache (#1473) - Add IsClosed method (#1478) - Cleanup: Avoid truncating in vlog.Open on error (#1465) - Cleanup: Do not close cache before compactions (#1464) ### New APIs - Badger.DB - BlockCacheMetrics (#1473) - IndexCacheMetrics (#1473) - Badger.Option - WithBlockCacheSize (#1473) - WithIndexCacheSize (#1473) ### Removed APIs [Breaking Changes] - Badger.DB - DataCacheMetrics (#1473) - BfCacheMetrics (#1473) - Badger.Option - WithMaxCacheSize (#1473) - WithMaxBfCacheSize (#1473) - WithKeepBlockIndicesInCache (#1473) - WithKeepBlocksInCache (#1473) ## [2.2007.1] - 2020-08-19 ### Fixed - Remove vlog file if bootstrap, syncDir or mmap fails (#1434) - levels: Compaction incorrectly drops some delete markers (#1422) - Replay: Update head for LSM entires also (#1456) ## [2.2007.0] - 2020-08-10 ### Fixed - Add a limit to the size of the batches sent over a stream. 
(#1412)
- Fix Sequence generates duplicate values (#1281)
- Fix race condition in DoesNotHave (#1287)
- Fail fast if cgo is disabled and compression is ZSTD (#1284)
- Proto: make badger/v2 compatible with v1 (#1293)
- Proto: Rename dgraph.badger.v2.pb to badgerpb2 (#1314)
- Handle duplicates in ManagedWriteBatch (#1315)
- Ensure `bitValuePointer` flag is cleared for LSM entry values written to LSM (#1313)
- DropPrefix: Return error on blocked writes (#1329)
- Confirm `badgerMove` entry required before rewrite (#1302)
- Drop move keys when its key prefix is dropped (#1331)
- Iterator: Always add key to txn.reads (#1328)
- Restore: Account for value size as well (#1358)
- Compaction: Expired keys and delete markers are never purged (#1354)
- GC: Consider size of value while rewriting (#1357)
- Force KeepL0InMemory to be true when InMemory is true (#1375)
- Rework DB.DropPrefix (#1381)
- Update head while replaying value log (#1372)
- Avoid panic on multiple closer.Signal calls (#1401)
- Return error if the vlog writes exceeds more than 4GB (#1400)

### Performance
- Clean up transaction oracle as we go (#1275)
- Use cache for storing block offsets (#1336)

### Features
- Support disabling conflict detection (#1344)
- Add leveled logging (#1249)
- Support entry version in Write batch (#1310)
- Add Write method to batch write (#1321)
- Support multiple iterators in read-write transactions (#1286)

### New APIs
- Badger.DB
  - NewManagedWriteBatch (#1310)
  - DropPrefix (#1381)
- Badger.Option
  - WithDetectConflicts (#1344)
  - WithKeepBlockIndicesInCache (#1336)
  - WithKeepBlocksInCache (#1336)
- Badger.WriteBatch
  - DeleteAt (#1310)
  - SetEntryAt (#1310)
  - Write (#1321)

### Changes to Default Options
- DefaultOptions: Set KeepL0InMemory to false (#1345)
- Increase default valueThreshold from 32B to 1KB (#1346)

### Deprecated
- Badger.Option
  - WithEventLogging (#1203)

### Reverts
This section lists the changes which were reverted because of non-reproducible crashes.
- Compress/Encrypt Blocks in the background (#1227)

## [2.0.3] - 2020-03-24

### Fixed
- Add support for watching nil prefix in subscribe API (#1246)

### Performance
- Compress/Encrypt Blocks in the background (#1227)
- Disable cache by default (#1257)

### Features
- Add BypassDirLock option (#1243)
- Add separate cache for bloomfilters (#1260)

### New APIs
- badger.DB
  - BfCacheMetrics (#1260)
  - DataCacheMetrics (#1260)
- badger.Options
  - WithBypassLockGuard (#1243)
  - WithLoadBloomsOnOpen (#1260)
  - WithMaxBfCacheSize (#1260)

## [2.0.2] - 2020-03-02

### Fixed
- Cast sz to uint32 to fix compilation on 32 bit. (#1175)
- Fix checkOverlap in compaction. (#1166)
- Avoid sync in inmemory mode. (#1190)
- Support disabling the cache completely. (#1185)
- Add support for caching bloomfilters. (#1204)
- Fix int overflow for 32bit. (#1216)
- Remove the 'this entry should've caught' log from value.go. (#1170)
- Rework concurrency semantics of valueLog.maxFid. (#1187)

### Performance
- Use fastRand instead of locked-rand in skiplist. (#1173)
- Improve write stalling on level 0 and 1.
(#1186) - Disable compression and set ZSTD Compression Level to 1. (#1191) ## [2.0.1] - 2020-01-02 ### New APIs - badger.Options - WithInMemory (f5b6321) - WithZSTDCompressionLevel (3eb4e72) - Badger.TableInfo - EstimatedSz (f46f8ea) ### Features - Introduce in-memory mode in badger. (#1113) ### Fixed - Limit manifest's change set size. (#1119) - Cast idx to uint32 to fix compilation on i386. (#1118) - Fix request increment ref bug. (#1121) - Fix windows dataloss issue. (#1134) - Fix VerifyValueChecksum checks. (#1138) - Fix encryption in stream writer. (#1146) - Fix segmentation fault in vlog.Read. (header.Decode) (#1150) - Fix merge iterator duplicates issue. (#1157) ### Performance - Set level 15 as default compression level in Zstd. (#1111) - Optimize createTable in stream_writer.go. (#1132) ## [2.0.0] - 2019-11-12 ### New APIs - badger.DB - NewWriteBatchAt (7f43769) - CacheMetrics (b9056f1) - badger.Options - WithMaxCacheSize (b9056f1) - WithEventLogging (75c6a44) - WithBlockSize (1439463) - WithBloomFalsePositive (1439463) - WithKeepL0InMemory (ee70ff2) - WithVerifyValueChecksum (ee70ff2) - WithCompression (5f3b061) - WithEncryptionKey (a425b0e) - WithEncryptionKeyRotationDuration (a425b0e) - WithChecksumVerificationMode (7b4083d) ### Features - Data cache to speed up lookups and iterations. (#1066) - Data compression. (#1013) - Data encryption-at-rest. (#1042) ### Fixed - Fix deadlock when flushing discard stats. (#976) - Set move key's expiresAt for keys with TTL. (#1006) - Fix unsafe usage in Decode. (#1097) - Fix race condition on db.orc.nextTxnTs. (#1101) - Fix level 0 GC dataloss bug. (#1090) - Fix deadlock in discard stats. (#1070) - Support checksum verification for values read from vlog. (#1052) - Store entire L0 in memory. (#963) - Fix table.Smallest/Biggest and iterator Prefix bug. (#997) - Use standard proto functions for Marshal/Unmarshal and Size. (#994) - Fix boundaries on GC batch size. (#987) - VlogSize to store correct directory name to expvar.Map. (#956) - Fix transaction too big issue in restore. (#957) - Fix race condition in updateDiscardStats. (#973) - Cast results of len to uint32 to fix compilation in i386 arch. (#961) - Making the stream writer APIs goroutine-safe. (#959) - Fix prefix bug in key iterator and allow all versions. (#950) - Drop discard stats if we can't unmarshal it. (#936) - Fix race condition in flushDiscardStats function. (#921) - Ensure rewrite in vlog is within transactional limits. (#911) - Fix discard stats moved by GC bug. (#929) - Fix busy-wait loop in Watermark. (#920) ### Performance - Introduce fast merge iterator. (#1080) - Binary search based table picker. (#983) - Flush vlog buffer if it grows beyond threshold. (#1067) - Introduce StreamDone in Stream Writer. (#1061) - Performance Improvements to block iterator. (#977) - Prevent unnecessary safecopy in iterator parseKV. (#971) - Use pointers instead of binary encoding. (#965) - Reuse block iterator inside table iterator. (#972) - [breaking/format] Remove vlen from entry header. (#945) - Replace FarmHash with AESHash for Oracle conflicts. (#952) - [breaking/format] Optimize Bloom filters. (#940) - [breaking/format] Use varint for header encoding (without header length). (#935) - Change file picking strategy in compaction. (#894) - [breaking/format] Block level changes. (#880) - [breaking/format] Add key-offset index to the end of SST table. (#881) ## [1.6.0] - 2019-07-01 This is a release including almost 200 commits, so expect many changes - some of them not backward compatible. 
Regarding backward compatibility in Badger versions, you might be interested on reading [VERSIONING.md](VERSIONING.md). _Note_: The hashes in parentheses correspond to the commits that impacted the given feature. ### New APIs - badger.DB - DropPrefix (291295e) - Flatten (7e41bba) - KeySplits (4751ef1) - MaxBatchCount (b65e2a3) - MaxBatchSize (b65e2a3) - PrintKeyValueHistogram (fd59907) - Subscribe (26128a7) - Sync (851e462) - badger.DefaultOptions() and badger.LSMOnlyOptions() (91ce687) - badger.Options.WithX methods - badger.Entry (e9447c9) - NewEntry - WithMeta - WithDiscard - WithTTL - badger.Item - KeySize (fd59907) - ValueSize (5242a99) - badger.IteratorOptions - PickTable (7d46029, 49a49e3) - Prefix (7d46029) - badger.Logger (fbb2778) - badger.Options - CompactL0OnClose (7e41bba) - Logger (3f66663) - LogRotatesToFlush (2237832) - badger.Stream (14cbd89, 3258067) - badger.StreamWriter (7116e16) - badger.TableInfo.KeyCount (fd59907) - badger.TableManifest (2017987) - badger.Tx.NewKeyIterator (49a49e3) - badger.WriteBatch (6daccf9, 7e78e80) ### Modified APIs #### Breaking changes: - badger.DefaultOptions and badger.LSMOnlyOptions are now functions rather than variables (91ce687) - badger.Item.Value now receives a function that returns an error (439fd46) - badger.Txn.Commit doesn't receive any params now (6daccf9) - badger.DB.Tables now receives a boolean (76b5341) #### Not breaking changes: - badger.LSMOptions changed values (799c33f) - badger.DB.NewIterator now allows multiple iterators per RO txn (41d9656) - badger.Options.TableLoadingMode's new default is options.MemoryMap (6b97bac) ### Removed APIs - badger.ManagedDB (d22c0e8) - badger.Options.DoNotCompact (7e41bba) - badger.Txn.SetWithX (e9447c9) ### Tools: - badger bank disect (13db058) - badger bank test (13db058) --mmap (03870e3) - badger fill (7e41bba) - badger flatten (7e41bba) - badger info --histogram (fd59907) --history --lookup --show-keys --show-meta --with-prefix (09e9b63) --show-internal (fb2eed9) - badger benchmark read (239041e) - badger benchmark write (6d3b67d) ## [1.5.5] - 2019-06-20 * Introduce support for Go Modules ## [1.5.3] - 2018-07-11 Bug Fixes: * Fix a panic caused due to item.vptr not copying over vs.Value, when looking for a move key. ## [1.5.2] - 2018-06-19 Bug Fixes: * Fix the way move key gets generated. * If a transaction has unclosed, or multiple iterators running simultaneously, throw a panic. Every iterator must be properly closed. At any point in time, only one iterator per transaction can be running. This is to avoid bugs in a transaction data structure which is thread unsafe. * *Warning: This change might cause panics in user code. Fix is to properly close your iterators, and only have one running at a time per transaction.* ## [1.5.1] - 2018-06-04 Bug Fixes: * Fix for infinite yieldItemValue recursion. #503 * Fix recursive addition of `badgerMove` prefix. https://github.com/dgraph-io/badger/commit/2e3a32f0ccac3066fb4206b28deb39c210c5266f * Use file size based window size for sampling, instead of fixing it to 10MB. #501 Cleanup: * Clarify comments and documentation. * Move badger tool one directory level up. ## [1.5.0] - 2018-05-08 * Introduce `NumVersionsToKeep` option. This option is used to discard many versions of the same key, which saves space. * Add a new `SetWithDiscard` method, which would indicate that all the older versions of the key are now invalid. Those versions would be discarded during compactions. 
* Value log GC moves are now bound to another keyspace to ensure latest versions of data are always at the top of the LSM tree.
* Introduce `ValueLogMaxEntries` to restrict the number of key-value pairs per value log file. This helps bound the time it takes to garbage collect one file.

## [1.4.0] - 2018-05-04
* Make mmap-ing of value log optional.
* Run GC multiple times, based on recorded discard statistics.
* Add MergeOperator.
* Force compact L0 on close (#439).
* Add truncate option to warn about data loss (#452).
* Discard key versions during compaction (#464).
* Introduce new `LSMOnlyOptions`, to make Badger act like a typical LSM based DB.

Bug fix:
* (Temporary) Check max version across all tables in Get (removed in next release).
* Update commit and read ts while loading from backup.
* Ensure all transaction entries are part of the same value log file.
* On commit, run unlock callbacks before doing writes (#413).
* Wait for goroutines to finish before closing iterators (#421).

## [1.3.0] - 2017-12-12
* Add `DB.NextSequence()` method to generate monotonically increasing integer sequences.
* Add `DB.Size()` method to return the size of LSM and value log files.
* Tweaked mmap code to make Windows 32-bit builds work.
* Tweaked build tags on some files to make iOS builds work.
* Fix `DB.PurgeOlderVersions()` to not violate some constraints.

## [1.2.0] - 2017-11-30
* Expose a `Txn.SetEntry()` method to allow setting the key-value pair and all the metadata at the same time.

## [1.1.1] - 2017-11-28
* Fix bug where txn.Get was returning a key deleted in the same transaction.
* Fix race condition while decrementing reference in oracle.
* Update doneCommit in the callback for CommitAsync.
* Iterators now see writes of the current txn.

## [1.1.0] - 2017-11-13
* Create Badger directory if it does not exist when `badger.Open` is called.
* Added `Item.ValueCopy()` to avoid deadlocks in long-running iterations * Fixed 64-bit alignment issues to make Badger run on Arm v7 ## [1.0.1] - 2017-11-06 * Fix an uint16 overflow when resizing key slice [Unreleased]: https://github.com/dgraph-io/badger/compare/v2.2007.2...HEAD [2.2007.2]: https://github.com/dgraph-io/badger/compare/v2.2007.1...v2.2007.2 [2.2007.1]: https://github.com/dgraph-io/badger/compare/v2.2007.0...v2.2007.1 [2.2007.0]: https://github.com/dgraph-io/badger/compare/v2.0.3...v2.2007.0 [2.0.3]: https://github.com/dgraph-io/badger/compare/v2.0.2...v2.0.3 [2.0.2]: https://github.com/dgraph-io/badger/compare/v2.0.1...v2.0.2 [2.0.1]: https://github.com/dgraph-io/badger/compare/v2.0.0...v2.0.1 [2.0.0]: https://github.com/dgraph-io/badger/compare/v1.6.0...v2.0.0 [1.6.0]: https://github.com/dgraph-io/badger/compare/v1.5.5...v1.6.0 [1.5.5]: https://github.com/dgraph-io/badger/compare/v1.5.3...v1.5.5 [1.5.3]: https://github.com/dgraph-io/badger/compare/v1.5.2...v1.5.3 [1.5.2]: https://github.com/dgraph-io/badger/compare/v1.5.1...v1.5.2 [1.5.1]: https://github.com/dgraph-io/badger/compare/v1.5.0...v1.5.1 [1.5.0]: https://github.com/dgraph-io/badger/compare/v1.4.0...v1.5.0 [1.4.0]: https://github.com/dgraph-io/badger/compare/v1.3.0...v1.4.0 [1.3.0]: https://github.com/dgraph-io/badger/compare/v1.2.0...v1.3.0 [1.2.0]: https://github.com/dgraph-io/badger/compare/v1.1.1...v1.2.0 [1.1.1]: https://github.com/dgraph-io/badger/compare/v1.1.0...v1.1.1 [1.1.0]: https://github.com/dgraph-io/badger/compare/v1.0.1...v1.1.0 [1.0.1]: https://github.com/dgraph-io/badger/compare/v1.0.0...v1.0.1 badger-2.2007.2/CODE_OF_CONDUCT.md000066400000000000000000000001251372173116500157730ustar00rootroot00000000000000# Code of Conduct Our Code of Conduct can be found here: https://dgraph.io/conduct badger-2.2007.2/CONTRIBUTING.md000066400000000000000000000052431372173116500154330ustar00rootroot00000000000000# Contribution Guide * [Before you get started](#before-you-get-started) * [Code of Conduct](#code-of-conduct) * [Your First Contribution](#your-first-contribution) * [Find a good first topic](#find-a-good-first-topic) * [Setting up your development environment](#setting-up-your-development-environment) * [Fork the project](#fork-the-project) * [Clone the project](#clone-the-project) * [New branch for a new code](#new-branch-for-a-new-code) * [Test](#test) * [Commit and push](#commit-and-push) * [Create a Pull Request](#create-a-pull-request) * [Sign the CLA](#sign-the-cla) * [Get a code review](#get-a-code-review) ## Before you get started ### Code of Conduct Please make sure to read and observe our [Code of Conduct](./CODE_OF_CONDUCT.md). ## Your First Contribution ### Find a good first topic You can start by finding an existing issue with the [good first issue](https://github.com/dgraph-io/badger/labels/good%20first%20issue) or [help wanted](https://github.com/dgraph-io/badger/labels/help%20wanted) labels. These issues are well suited for new contributors. ## Setting up your development environment Badger uses [`Go Modules`](https://github.com/golang/go/wiki/Modules) to manage dependencies. The version of Go should be **1.12** or above. 
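Before moving on, it can help to confirm that your toolchain meets this requirement. A quick check in a Unix shell might look like the following (the version in the output is just an example):

```sh
$ go version
go version go1.12.17 linux/amd64

# If badger was cloned inside GOPATH, Go modules may need to be enabled explicitly.
$ export GO111MODULE=on
```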
### Fork the project - Visit https://github.com/dgraph-io/badger - Click the `Fork` button (top right) to create a fork of the repository ### Clone the project ```sh $ git clone https://github.com/$GITHUB_USER/badger $ cd badger $ git remote add upstream git@github.com:dgraph-io/badger.git # Never push to the upstream master git remote set-url --push upstream no_push ``` ### New branch for a new code Get your local master up to date: ```sh $ git fetch upstream $ git checkout master $ git rebase upstream/master ``` Create a new branch from the master: ```sh $ git checkout -b my_new_feature ``` And now you can finally add your changes to project. ### Test Build and run all tests: ```sh $ ./test.sh ``` ### Commit and push Commit your changes: ```sh $ git commit ``` When the changes are ready to review: ```sh $ git push origin my_new_feature ``` ### Create a Pull Request Just open `https://github.com/$GITHUB_USER/badger/pull/new/my_new_feature` and fill the PR description. ### Sign the CLA Click the **Sign in with Github to agree** button to sign the CLA. [An example](https://cla-assistant.io/dgraph-io/badger?pullRequest=1377). ### Get a code review If your pull request (PR) is opened, it will be assigned to one or more reviewers. Those reviewers will do a code review. To address review comments, you should commit the changes to the same branch of the PR on your fork. badger-2.2007.2/LICENSE000066400000000000000000000236751372173116500142200ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS badger-2.2007.2/README.md000066400000000000000000001167151372173116500144700ustar00rootroot00000000000000# BadgerDB [![GoDoc](https://godoc.org/github.com/dgraph-io/badger?status.svg)](https://godoc.org/github.com/dgraph-io/badger) [![Go Report Card](https://goreportcard.com/badge/github.com/dgraph-io/badger)](https://goreportcard.com/report/github.com/dgraph-io/badger) [![Sourcegraph](https://sourcegraph.com/github.com/dgraph-io/badger/-/badge.svg)](https://sourcegraph.com/github.com/dgraph-io/badger?badge) [![Build Status](https://teamcity.dgraph.io/guestAuth/app/rest/builds/buildType:(id:Badger_UnitTests)/statusIcon.svg)](https://teamcity.dgraph.io/viewLog.html?buildTypeId=Badger_UnitTests&buildId=lastFinished&guest=1) ![Appveyor](https://ci.appveyor.com/api/projects/status/github/dgraph-io/badger?branch=master&svg=true) [![Coverage Status](https://coveralls.io/repos/github/dgraph-io/badger/badge.svg?branch=master)](https://coveralls.io/github/dgraph-io/badger?branch=master) ![Badger mascot](images/diggy-shadow.png) BadgerDB is an embeddable, persistent and fast key-value (KV) database written in pure Go. It is the underlying database for [Dgraph](https://dgraph.io), a fast, distributed graph database. It's meant to be a performant alternative to non-Go-based key-value stores like RocksDB. ## Project Status [March 24, 2020] Badger is stable and is being used to serve data sets worth hundreds of terabytes. Badger supports concurrent ACID transactions with serializable snapshot isolation (SSI) guarantees. A Jepsen-style bank test runs nightly for 8h, with `--race` flag and ensures the maintenance of transactional guarantees. Badger has also been tested to work with filesystem level anomalies, to ensure persistence and consistency. Badger is being used by a number of projects which includes Dgraph, Jaeger Tracing, UsenetExpress, and many more. The list of projects using Badger can be found [here](#projects-using-badger). Badger v1.0 was released in Nov 2017, and the latest version that is data-compatible with v1.0 is v1.6.0. Badger v2.0 was released in Nov 2019 with a new storage format which won't be compatible with all of the v1.x. 
Badger v2.0 supports compression, encryption and uses a cache to speed up lookup. The [Changelog] is kept fairly up-to-date. For more details on our version naming schema please read [Choosing a version](#choosing-a-version). [Changelog]:https://github.com/dgraph-io/badger/blob/master/CHANGELOG.md ## Table of Contents * [Getting Started](#getting-started) + [Installing](#installing) - [Choosing a version](#choosing-a-version) + [Opening a database](#opening-a-database) + [Transactions](#transactions) - [Read-only transactions](#read-only-transactions) - [Read-write transactions](#read-write-transactions) - [Managing transactions manually](#managing-transactions-manually) + [Using key/value pairs](#using-keyvalue-pairs) + [Monotonically increasing integers](#monotonically-increasing-integers) * [Merge Operations](#merge-operations) + [Setting Time To Live(TTL) and User Metadata on Keys](#setting-time-to-livettl-and-user-metadata-on-keys) + [Iterating over keys](#iterating-over-keys) - [Prefix scans](#prefix-scans) - [Key-only iteration](#key-only-iteration) + [Stream](#stream) + [Garbage Collection](#garbage-collection) + [Database backup](#database-backup) + [Memory usage](#memory-usage) + [Statistics](#statistics) * [Resources](#resources) + [Blog Posts](#blog-posts) * [Contact](#contact) * [Design](#design) + [Comparisons](#comparisons) + [Benchmarks](#benchmarks) * [Projects Using Badger](#projects-using-badger) * [Contributing](#contributing) * [Frequently Asked Questions](#frequently-asked-questions) ## Getting Started ### Installing To start using Badger, install Go 1.12 or above and run `go get`: ```sh $ go get github.com/dgraph-io/badger/v2 ``` This will retrieve the library and install the `badger` command line utility into your `$GOBIN` path. ##### Note: Badger does not directly use CGO but it relies on https://github.com/DataDog/zstd for compression and it requires gcc/cgo. If you wish to use badger without gcc/cgo, you can run `CGO_ENABLED=0 go get github.com/dgraph-io/badger/...` which will download badger without the support for ZSTD compression algorithm. #### Choosing a version BadgerDB is a pretty special package from the point of view that the most important change we can make to it is not on its API but rather on how data is stored on disk. This is why we follow a version naming schema that differs from Semantic Versioning. - New major versions are released when the data format on disk changes in an incompatible way. - New minor versions are released whenever the API changes but data compatibility is maintained. Note that the changes on the API could be backward-incompatible - unlike Semantic Versioning. - New patch versions are released when there's no changes to the data format nor the API. Following these rules: - v1.5.0 and v1.6.0 can be used on top of the same files without any concerns, as their major version is the same, therefore the data format on disk is compatible. - v1.6.0 and v2.0.0 are data incompatible as their major version implies, so files created with v1.6.0 will need to be converted into the new format before they can be used by v2.0.0. For a longer explanation on the reasons behind using a new versioning naming schema, you can read [VERSIONING.md](VERSIONING.md). ### Opening a database The top-level object in Badger is a `DB`. It represents multiple files on disk in specific directories, which contain the data for a single database. To open your database, use the `badger.Open()` function, with the appropriate options. 
The `Dir` and `ValueDir` options are mandatory and must be specified by the client. They can be set to the same value to simplify things. ```go package main import ( "log" badger "github.com/dgraph-io/badger/v2" ) func main() { // Open the Badger database located in the /tmp/badger directory. // It will be created if it doesn't exist. db, err := badger.Open(badger.DefaultOptions("/tmp/badger")) if err != nil { log.Fatal(err) } defer db.Close()  // Your code here… } ``` Please note that Badger obtains a lock on the directories so multiple processes cannot open the same database at the same time. #### In-Memory Mode/Diskless Mode By default, Badger ensures all the data is persisted to the disk. It also supports a pure in-memory mode. When Badger is running in in-memory mode, all the data is stored in the memory. Reads and writes are much faster in in-memory mode, but all the data stored in Badger will be lost in case of a crash or close. To open badger in in-memory mode, set the `InMemory` option. ``` opt := badger.DefaultOptions("").WithInMemory(true) ``` ### Transactions #### Read-only transactions To start a read-only transaction, you can use the `DB.View()` method: ```go err := db.View(func(txn *badger.Txn) error {  // Your code here…  return nil }) ``` You cannot perform any writes or deletes within this transaction. Badger ensures that you get a consistent view of the database within this closure. Any writes that happen elsewhere after the transaction has started, will not be seen by calls made within the closure. #### Read-write transactions To start a read-write transaction, you can use the `DB.Update()` method: ```go err := db.Update(func(txn *badger.Txn) error {  // Your code here…  return nil }) ``` All database operations are allowed inside a read-write transaction. Always check the returned error value. If you return an error within your closure it will be passed through. An `ErrConflict` error will be reported in case of a conflict. Depending on the state of your application, you have the option to retry the operation if you receive this error. An `ErrTxnTooBig` will be reported in case the number of pending writes/deletes in the transaction exceeds a certain limit. In that case, it is best to commit the transaction and start a new transaction immediately. Here is an example (we are not checking for errors in some places for simplicity): ```go updates := make(map[string]string) txn := db.NewTransaction(true) for k,v := range updates { if err := txn.Set([]byte(k),[]byte(v)); err == badger.ErrTxnTooBig { _ = txn.Commit() txn = db.NewTransaction(true) _ = txn.Set([]byte(k),[]byte(v)) } } _ = txn.Commit() ``` #### Managing transactions manually The `DB.View()` and `DB.Update()` methods are wrappers around the `DB.NewTransaction()` and `Txn.Commit()` methods (or `Txn.Discard()` in case of read-only transactions). These helper methods will start the transaction, execute a function, and then safely discard your transaction if an error is returned. This is the recommended way to use Badger transactions. However, sometimes you may want to manually create and commit your transactions. You can use the `DB.NewTransaction()` function directly, which takes in a boolean argument to specify whether a read-write transaction is required. For read-write transactions, it is necessary to call `Txn.Commit()` to ensure the transaction is committed. For read-only transactions, calling `Txn.Discard()` is sufficient. 
`Txn.Commit()` also calls `Txn.Discard()` internally to cleanup the transaction, so just calling `Txn.Commit()` is sufficient for read-write transaction. However, if your code doesn’t call `Txn.Commit()` for some reason (for e.g it returns prematurely with an error), then please make sure you call `Txn.Discard()` in a `defer` block. Refer to the code below. ```go // Start a writable transaction. txn := db.NewTransaction(true) defer txn.Discard() // Use the transaction... err := txn.Set([]byte("answer"), []byte("42")) if err != nil { return err } // Commit the transaction and check for error. if err := txn.Commit(); err != nil { return err } ``` The first argument to `DB.NewTransaction()` is a boolean stating if the transaction should be writable. Badger allows an optional callback to the `Txn.Commit()` method. Normally, the callback can be set to `nil`, and the method will return after all the writes have succeeded. However, if this callback is provided, the `Txn.Commit()` method returns as soon as it has checked for any conflicts. The actual writing to the disk happens asynchronously, and the callback is invoked once the writing has finished, or an error has occurred. This can improve the throughput of the application in some cases. But it also means that a transaction is not durable until the callback has been invoked with a `nil` error value. ### Using key/value pairs To save a key/value pair, use the `Txn.Set()` method: ```go err := db.Update(func(txn *badger.Txn) error { err := txn.Set([]byte("answer"), []byte("42")) return err }) ``` Key/Value pair can also be saved by first creating `Entry`, then setting this `Entry` using `Txn.SetEntry()`. `Entry` also exposes methods to set properties on it. ```go err := db.Update(func(txn *badger.Txn) error { e := badger.NewEntry([]byte("answer"), []byte("42")) err := txn.SetEntry(e) return err }) ``` This will set the value of the `"answer"` key to `"42"`. To retrieve this value, we can use the `Txn.Get()` method: ```go err := db.View(func(txn *badger.Txn) error { item, err := txn.Get([]byte("answer")) handle(err) var valNot, valCopy []byte err := item.Value(func(val []byte) error { // This func with val would only be called if item.Value encounters no error. // Accessing val here is valid. fmt.Printf("The answer is: %s\n", val) // Copying or parsing val is valid. valCopy = append([]byte{}, val...) // Assigning val slice to another variable is NOT OK. valNot = val // Do not do this. return nil }) handle(err) // DO NOT access val here. It is the most common cause of bugs. fmt.Printf("NEVER do this. %s\n", valNot) // You must copy it to use it outside item.Value(...). fmt.Printf("The answer is: %s\n", valCopy) // Alternatively, you could also use item.ValueCopy(). valCopy, err = item.ValueCopy(nil) handle(err) fmt.Printf("The answer is: %s\n", valCopy) return nil }) ``` `Txn.Get()` returns `ErrKeyNotFound` if the value is not found. Please note that values returned from `Get()` are only valid while the transaction is open. If you need to use a value outside of the transaction then you must use `copy()` to copy it to another byte slice. Use the `Txn.Delete()` method to delete a key. ### Monotonically increasing integers To get unique monotonically increasing integers with strong durability, you can use the `DB.GetSequence` method. This method returns a `Sequence` object, which is thread-safe and can be used concurrently via various goroutines. 
Badger would lease a range of integers to hand out from memory, with the bandwidth provided to `DB.GetSequence`. The frequency at which disk writes are done is determined by this lease bandwidth and the frequency of `Next` invocations. Setting a bandwidth too low would do more disk writes, setting it too high would result in wasted integers if Badger is closed or crashes. To avoid wasted integers, call `Release` before closing Badger. ```go seq, err := db.GetSequence(key, 1000) defer seq.Release() for { num, err := seq.Next() } ``` ### Merge Operations Badger provides support for ordered merge operations. You can define a func of type `MergeFunc` which takes in an existing value, and a value to be _merged_ with it. It returns a new value which is the result of the _merge_ operation. All values are specified in byte arrays. For e.g., here is a merge function (`add`) which appends a `[]byte` value to an existing `[]byte` value. ```Go // Merge function to append one byte slice to another func add(originalValue, newValue []byte) []byte { return append(originalValue, newValue...) } ``` This function can then be passed to the `DB.GetMergeOperator()` method, along with a key, and a duration value. The duration specifies how often the merge function is run on values that have been added using the `MergeOperator.Add()` method. `MergeOperator.Get()` method can be used to retrieve the cumulative value of the key associated with the merge operation. ```Go key := []byte("merge") m := db.GetMergeOperator(key, add, 200*time.Millisecond) defer m.Stop() m.Add([]byte("A")) m.Add([]byte("B")) m.Add([]byte("C")) res, _ := m.Get() // res should have value ABC encoded ``` Example: Merge operator which increments a counter ```Go func uint64ToBytes(i uint64) []byte { var buf [8]byte binary.BigEndian.PutUint64(buf[:], i) return buf[:] } func bytesToUint64(b []byte) uint64 { return binary.BigEndian.Uint64(b) } // Merge function to add two uint64 numbers func add(existing, new []byte) []byte { return uint64ToBytes(bytesToUint64(existing) + bytesToUint64(new)) } ``` It can be used as ```Go key := []byte("merge") m := db.GetMergeOperator(key, add, 200*time.Millisecond) defer m.Stop() m.Add(uint64ToBytes(1)) m.Add(uint64ToBytes(2)) m.Add(uint64ToBytes(3)) res, _ := m.Get() // res should have value 6 encoded ``` ### Setting Time To Live(TTL) and User Metadata on Keys Badger allows setting an optional Time to Live (TTL) value on keys. Once the TTL has elapsed, the key will no longer be retrievable and will be eligible for garbage collection. A TTL can be set as a `time.Duration` value using the `Entry.WithTTL()` and `Txn.SetEntry()` API methods. ```go err := db.Update(func(txn *badger.Txn) error { e := badger.NewEntry([]byte("answer"), []byte("42")).WithTTL(time.Hour) err := txn.SetEntry(e) return err }) ``` An optional user metadata value can be set on each key. A user metadata value is represented by a single byte. It can be used to set certain bits along with the key to aid in interpreting or decoding the key-value pair. User metadata can be set using `Entry.WithMeta()` and `Txn.SetEntry()` API methods. ```go err := db.Update(func(txn *badger.Txn) error { e := badger.NewEntry([]byte("answer"), []byte("42")).WithMeta(byte(1)) err := txn.SetEntry(e) return err }) ``` `Entry` APIs can be used to add the user metadata and TTL for same key. This `Entry` then can be set using `Txn.SetEntry()`. 
```go err := db.Update(func(txn *badger.Txn) error { e := badger.NewEntry([]byte("answer"), []byte("42")).WithMeta(byte(1)).WithTTL(time.Hour) err := txn.SetEntry(e) return err }) ``` ### Iterating over keys To iterate over keys, we can use an `Iterator`, which can be obtained using the `Txn.NewIterator()` method. Iteration happens in byte-wise lexicographical sorting order. ```go err := db.View(func(txn *badger.Txn) error { opts := badger.DefaultIteratorOptions opts.PrefetchSize = 10 it := txn.NewIterator(opts) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { item := it.Item() k := item.Key() err := item.Value(func(v []byte) error { fmt.Printf("key=%s, value=%s\n", k, v) return nil }) if err != nil { return err } } return nil }) ``` The iterator allows you to move to a specific point in the list of keys and move forward or backward through the keys one at a time. By default, Badger prefetches the values of the next 100 items. You can adjust that with the `IteratorOptions.PrefetchSize` field. However, setting it to a value higher than `GOMAXPROCS` (which we recommend to be 128 or higher) shouldn’t give any additional benefits. You can also turn off the fetching of values altogether. See section below on key-only iteration. #### Prefix scans To iterate over a key prefix, you can combine `Seek()` and `ValidForPrefix()`: ```go db.View(func(txn *badger.Txn) error { it := txn.NewIterator(badger.DefaultIteratorOptions) defer it.Close() prefix := []byte("1234") for it.Seek(prefix); it.ValidForPrefix(prefix); it.Next() { item := it.Item() k := item.Key() err := item.Value(func(v []byte) error { fmt.Printf("key=%s, value=%s\n", k, v) return nil }) if err != nil { return err } } return nil }) ``` #### Key-only iteration Badger supports a unique mode of iteration called _key-only_ iteration. It is several order of magnitudes faster than regular iteration, because it involves access to the LSM-tree only, which is usually resident entirely in RAM. To enable key-only iteration, you need to set the `IteratorOptions.PrefetchValues` field to `false`. This can also be used to do sparse reads for selected keys during an iteration, by calling `item.Value()` only when required. ```go err := db.View(func(txn *badger.Txn) error { opts := badger.DefaultIteratorOptions opts.PrefetchValues = false it := txn.NewIterator(opts) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { item := it.Item() k := item.Key() fmt.Printf("key=%s\n", k) } return nil }) ``` ### Stream Badger provides a Stream framework, which concurrently iterates over all or a portion of the DB, converting data into custom key-values, and streams it out serially to be sent over network, written to disk, or even written back to Badger. This is a lot faster way to iterate over Badger than using a single Iterator. Stream supports Badger in both managed and normal mode. Stream uses the natural boundaries created by SSTables within the LSM tree, to quickly generate key ranges. Each goroutine then picks a range and runs an iterator to iterate over it. Each iterator iterates over all versions of values and is created from the same transaction, thus working over a snapshot of the DB. Every time a new key is encountered, it calls `ChooseKey(item)`, followed by `KeyToList(key, itr)`. This allows a user to select or reject that key, and if selected, convert the value versions into custom key-values. The goroutine batches up 4MB worth of key-values, before sending it over to a channel. 
Another goroutine further batches up data from this channel using *smart batching* algorithm and calls `Send` serially. This framework is designed for high throughput key-value iteration, spreading the work of iteration across many goroutines. `DB.Backup` uses this framework to provide full and incremental backups quickly. Dgraph is a heavy user of this framework. In fact, this framework was developed and used within Dgraph, before getting ported over to Badger. ```go stream := db.NewStream() // db.NewStreamAt(readTs) for managed mode. // -- Optional settings stream.NumGo = 16 // Set number of goroutines to use for iteration. stream.Prefix = []byte("some-prefix") // Leave nil for iteration over the whole DB. stream.LogPrefix = "Badger.Streaming" // For identifying stream logs. Outputs to Logger. // ChooseKey is called concurrently for every key. If left nil, assumes true by default. stream.ChooseKey = func(item *badger.Item) bool { return bytes.HasSuffix(item.Key(), []byte("er")) } // KeyToList is called concurrently for chosen keys. This can be used to convert // Badger data into custom key-values. If nil, uses stream.ToList, a default // implementation, which picks all valid key-values. stream.KeyToList = nil // -- End of optional settings. // Send is called serially, while Stream.Orchestrate is running. stream.Send = func(list *pb.KVList) error { return proto.MarshalText(w, list) // Write to w. } // Run the stream if err := stream.Orchestrate(context.Background()); err != nil { return err } // Done. ``` ### Garbage Collection Badger values need to be garbage collected, because of two reasons: * Badger keeps values separately from the LSM tree. This means that the compaction operations that clean up the LSM tree do not touch the values at all. Values need to be cleaned up separately. * Concurrent read/write transactions could leave behind multiple values for a single key, because they are stored with different versions. These could accumulate, and take up unneeded space beyond the time these older versions are needed. Badger relies on the client to perform garbage collection at a time of their choosing. It provides the following method, which can be invoked at an appropriate time: * `DB.RunValueLogGC()`: This method is designed to do garbage collection while Badger is online. Along with randomly picking a file, it uses statistics generated by the LSM-tree compactions to pick files that are likely to lead to maximum space reclamation. It is recommended to be called during periods of low activity in your system, or periodically. One call would only result in removal of at max one log file. As an optimization, you could also immediately re-run it whenever it returns nil error (indicating a successful value log GC), as shown below. ```go ticker := time.NewTicker(5 * time.Minute) defer ticker.Stop() for range ticker.C { again: err := db.RunValueLogGC(0.7) if err == nil { goto again } } ``` * `DB.PurgeOlderVersions()`: This method is **DEPRECATED** since v1.5.0. Now, Badger's LSM tree automatically discards older/invalid versions of keys. **Note: The RunValueLogGC method would not garbage collect the latest value log.** ### Database backup There are two public API methods `DB.Backup()` and `DB.Load()` which can be used to do online backups and restores. Badger v0.9 provides a CLI tool `badger`, which can do offline backup/restore. Make sure you have `$GOPATH/bin` in your PATH to use this tool. 
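For programmatic backups, the `DB.Backup()` and `DB.Load()` APIs mentioned above can be driven directly from Go. Below is a minimal sketch, assuming the usual `os` and `badger` imports; the helper names, the file path handling, and the choice of 256 maximum pending writes are illustrative assumptions, not part of Badger's API.

```go
// Sketch: full online backup to a file, and a restore from it.
// backupTo and restoreFrom are illustrative helpers, not Badger APIs.
func backupTo(db *badger.DB, path string) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	// since = 0 requests a full backup. The returned version can be stored
	// and passed back later to produce an incremental backup.
	_, err = db.Backup(f, 0)
	return err
}

func restoreFrom(db *badger.DB, path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()
	// 256 bounds the number of writes kept pending in memory during the load.
	return db.Load(f, 256)
}
```

The `badger` CLI achieves the same result without writing any code.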
The command below will create a version-agnostic backup of the database, to a file `badger.bak` in the current working directory:

```
badger backup --dir <path/to/badgerdb>
```

To restore `badger.bak` in the current working directory to a new database:

```
badger restore --dir <path/to/badgerdb>
```

See `badger --help` for more details.

If you have a Badger database that was created using v0.8 (or below), you can use the `badger_backup` tool provided in v0.8.1, and then restore it using the command above to upgrade your database to work with the latest version.

```
badger_backup --dir <path/to/badgerdb> --backup-file badger.bak
```

We recommend all users to use the `Backup` and `Restore` APIs and tools. However, Badger is also rsync-friendly because all files are immutable, barring the latest value log which is append-only. So, rsync can be used as a rudimentary way to perform a backup. In the following script, we repeat rsync to ensure that the LSM tree remains consistent with the MANIFEST file while doing a full backup.

```
#!/bin/bash
set -o history
set -o histexpand
# Makes a complete copy of a Badger database directory.
# Repeat rsync if the MANIFEST and SSTables are updated.
rsync -avz --delete db/ dst
while !! | grep -q "(MANIFEST\|\.sst)$"; do :; done
```

### Memory usage

Badger's memory usage can be managed by tweaking several options available in the `Options` struct that is passed in when opening the database using `DB.Open`.

- `Options.ValueLogLoadingMode` can be set to `options.FileIO` (instead of the default `options.MemoryMap`) to avoid memory-mapping log files. This can be useful in environments with low RAM.
- Number of memtables (`Options.NumMemtables`)
  - If you modify `Options.NumMemtables`, also adjust `Options.NumLevelZeroTables` and `Options.NumLevelZeroTablesStall` accordingly.
- Number of concurrent compactions (`Options.NumCompactors`)
- Mode in which LSM tree is loaded (`Options.TableLoadingMode`)
- Size of table (`Options.MaxTableSize`)
- Size of value log file (`Options.ValueLogFileSize`)

If you want to decrease the memory usage of a Badger instance, tweak these options (ideally one at a time) until you achieve the desired memory usage.

### Statistics

Badger records metrics using the [expvar] package, which is included in the Go standard library. All the metrics are documented in the [y/metrics.go][metrics] file.

The `expvar` package adds a handler to the default HTTP server (which has to be started explicitly), and serves up the metrics at the `/debug/vars` endpoint. These metrics can then be collected by a system like [Prometheus], to get better visibility into what Badger is doing.

[expvar]: https://golang.org/pkg/expvar/
[metrics]: https://github.com/dgraph-io/badger/blob/master/y/metrics.go
[Prometheus]: https://prometheus.io/

## Resources

### Blog Posts

1. [Introducing Badger: A fast key-value store written natively in Go](https://open.dgraph.io/post/badger/)
2. [Make Badger crash resilient with ALICE](https://blog.dgraph.io/post/alice/)
3. [Badger vs LMDB vs BoltDB: Benchmarking key-value databases in Go](https://blog.dgraph.io/post/badger-lmdb-boltdb/)
4. [Concurrent ACID Transactions in Badger](https://blog.dgraph.io/post/badger-txn/)

## Design

Badger was written with these design goals in mind:

- Write a key-value database in pure Go.
- Use latest research to build the fastest KV database for data sets spanning terabytes.
- Optimize for SSDs.

Badger’s design is based on a paper titled _[WiscKey: Separating Keys from Values in SSD-conscious Storage][wisckey]_.
[wisckey]: https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf

### Comparisons

| Feature                       | Badger                                     | RocksDB                       | BoltDB    |
| ----------------------------- | ------------------------------------------ | ----------------------------- | --------- |
| Design                        | LSM tree with value log                    | LSM tree only                 | B+ tree   |
| High Read throughput          | Yes                                        | No                            | Yes       |
| High Write throughput         | Yes                                        | Yes                           | No        |
| Designed for SSDs             | Yes (with latest research <sup>1</sup>)    | Not specifically <sup>2</sup> | No        |
| Embeddable                    | Yes                                        | Yes                           | Yes       |
| Sorted KV access              | Yes                                        | Yes                           | Yes       |
| Pure Go (no Cgo)              | Yes                                        | No                            | Yes       |
| Transactions                  | Yes, ACID, concurrent with SSI<sup>3</sup> | Yes (but non-ACID)            | Yes, ACID |
| Snapshots                     | Yes                                        | Yes                           | Yes       |
| TTL support                   | Yes                                        | Yes                           | No        |
| 3D access (key-value-version) | Yes<sup>4</sup>                            | No                            | No        |

<sup>1</sup> The [WISCKEY paper][wisckey] (on which Badger is based) saw big wins with separating values from keys, significantly reducing the write amplification compared to a typical LSM tree.

<sup>2</sup> RocksDB is an SSD optimized version of LevelDB, which was designed specifically for rotating disks. As such RocksDB's design isn't aimed at SSDs.

<sup>3</sup> SSI: Serializable Snapshot Isolation. For more details, see the blog post [Concurrent ACID Transactions in Badger](https://blog.dgraph.io/post/badger-txn/)

<sup>4</sup> Badger provides direct access to value versions via its Iterator API. Users can also specify how many versions to keep per key via Options.

### Benchmarks

We have run comprehensive benchmarks against RocksDB, Bolt and LMDB. The benchmarking code and the detailed logs for the benchmarks can be found in the [badger-bench] repo. More explanation, including graphs, can be found in the blog posts (linked above).

[badger-bench]: https://github.com/dgraph-io/badger-bench

## Projects Using Badger

Below is a list of known projects that use Badger:

* [Dgraph](https://github.com/dgraph-io/dgraph) - Distributed graph database.
* [Jaeger](https://github.com/jaegertracing/jaeger) - Distributed tracing platform.
* [go-ipfs](https://github.com/ipfs/go-ipfs) - Go client for the InterPlanetary File System (IPFS), a new hypermedia distribution protocol.
* [Riot](https://github.com/go-ego/riot) - An open-source, distributed search engine.
* [emitter](https://github.com/emitter-io/emitter) - Scalable, low latency, distributed pub/sub broker with message storage, uses MQTT, gossip and badger.
* [OctoSQL](https://github.com/cube2222/octosql) - Query tool that allows you to join, analyse and transform data from multiple databases using SQL.
* [Dkron](https://dkron.io/) - Distributed, fault tolerant job scheduling system.
* [Sandglass](https://github.com/celrenheit/sandglass) - Distributed, horizontally scalable, persistent, time sorted message queue.
* [TalariaDB](https://github.com/grab/talaria) - Grab's distributed, low latency time-series database.
* [Sloop](https://github.com/salesforce/sloop) - Salesforce's Kubernetes History Visualization Project.
* [Immudb](https://github.com/codenotary/immudb) - Lightweight, high-speed immutable database for systems and applications.
* [Usenet Express](https://usenetexpress.com/) - Serving over 300TB of data with Badger.
* [gorush](https://github.com/appleboy/gorush) - A push notification server written in Go.
* [0-stor](https://github.com/zero-os/0-stor) - Single device object store.
* [Dispatch Protocol](https://github.com/dispatchlabs/disgo) - Blockchain protocol for distributed application data analytics.
* [GarageMQ](https://github.com/valinurovam/garagemq) - AMQP server written in Go.
* [RedixDB](https://alash3al.github.io/redix/) - A real-time persistent key-value store with the same Redis protocol.
* [BBVA](https://github.com/BBVA/raft-badger) - Raft backend implementation using BadgerDB for Hashicorp raft.
* [Fantom](https://github.com/Fantom-foundation/go-lachesis) - aBFT Consensus platform for distributed applications.
* [decred](https://github.com/decred/dcrdata) - An open, progressive, and self-funding cryptocurrency with a system of community-based governance integrated into its blockchain.
* [OpenNetSys](https://github.com/opennetsys/c3-go) - Create useful dApps in any software language.
* [HoneyTrap](https://github.com/honeytrap/honeytrap) - An extensible and open-source system for running, monitoring and managing honeypots.
* [Insolar](https://github.com/insolar/insolar) - Enterprise-ready blockchain platform.
* [IoTeX](https://github.com/iotexproject/iotex-core) - The next generation of the decentralized network for IoT powered by scalability- and privacy-centric blockchains.
* [go-sessions](https://github.com/kataras/go-sessions) - The sessions manager for Go net/http and fasthttp.
* [Babble](https://github.com/mosaicnetworks/babble) - BFT Consensus platform for distributed applications.
* [Tormenta](https://github.com/jpincas/tormenta) - Embedded object-persistence layer / simple JSON database for Go projects.
* [BadgerHold](https://github.com/timshannon/badgerhold) - An embeddable NoSQL store for querying Go types built on Badger.
* [Goblero](https://github.com/didil/goblero) - Pure Go embedded persistent job queue backed by BadgerDB.
* [Surfline](https://www.surfline.com) - Serving global wave and weather forecast data with Badger.
* [Cete](https://github.com/mosuka/cete) - Simple and highly available distributed key-value store built on Badger. Makes it easy to bring up a cluster of Badger with the Raft consensus algorithm by hashicorp/raft.
* [Volument](https://volument.com/) - A new take on website analytics backed by Badger.
* [KVdb](https://kvdb.io/) - Hosted key-value store and serverless platform built on top of Badger.

If you are using Badger in a project please send a pull request to add it to the list.

## Contributing

If you're interested in contributing to Badger see [CONTRIBUTING.md](./CONTRIBUTING.md).

## Frequently Asked Questions

### My writes are getting stuck. Why?

**Update: With the new `Value(func(v []byte))` API, this deadlock can no longer happen.** The following is true for users on Badger v1.x.

This can happen if a long-running iteration is run with `Prefetch` set to false, but an `Item::Value` call is made internally in the loop. That causes Badger to acquire read locks over the value log files to avoid value log GC removing the file from underneath. As a side effect, this also blocks a new value log GC file from being created when the value log file boundary is hit.

Please see Github issues [#293](https://github.com/dgraph-io/badger/issues/293) and [#315](https://github.com/dgraph-io/badger/issues/315).

There are multiple workarounds during iteration:

1. Use `Item::ValueCopy` instead of `Item::Value` when retrieving a value.
1. Set `Prefetch` to true. Badger would then copy over the value and release the file lock immediately.
1. When `Prefetch` is false, don't call `Item::Value` and do a pure key-only iteration. This might be useful if you just want to delete a lot of keys.
1. Do the writes in a separate transaction after the reads.

### My writes are really slow. Why?
Are you creating a new transaction for every single key update, and waiting for it to `Commit` fully before creating a new one? This will lead to very low throughput.

We have created the `WriteBatch` API, which provides a way to batch up many updates into a single transaction and `Commit` that transaction using callbacks to avoid blocking. This amortizes the cost of a transaction really well, and provides the most efficient way to do bulk writes.

```go
wb := db.NewWriteBatch()
defer wb.Cancel()

for i := 0; i < N; i++ {
  err := wb.Set(key(i), value(i)) // Will create txns as needed.
  handle(err)
}
handle(wb.Flush()) // Wait for all txns to finish.
```

Note that the `WriteBatch` API does not allow any reads. For read-modify-write workloads, you should be using the `Transaction` API.

### I don't see any disk writes. Why?

If you're using Badger with `SyncWrites=false`, then your writes might not be written to the value log and won't get synced to disk immediately. Writes to the LSM tree are done in memory first, before they get compacted to disk. The compaction would only happen once `MaxTableSize` has been reached. So, if you're doing a few writes and then checking, you might not see anything on disk. Once you `Close` the database, you'll see these writes on disk.

### Reverse iteration doesn't give me the right results.

Just like forward iteration goes to the first key which is equal to or greater than the SEEK key, reverse iteration goes to the first key which is equal to or smaller than the SEEK key. Therefore, the SEEK key would not be part of the results. You can typically add a `0xff` byte as a suffix to the SEEK key to include it in the results. See the following issues: [#436](https://github.com/dgraph-io/badger/issues/436) and [#347](https://github.com/dgraph-io/badger/issues/347).

### Which instances should I use for Badger?

We recommend using instances which provide local SSD storage, without any limit on the maximum IOPS. In AWS, these are storage optimized instances like i3. They provide local SSDs which clock 100K IOPS over 4KB blocks easily.

### I'm getting a closed channel error. Why?

```
panic: close of closed channel
panic: send on closed channel
```

If you're seeing panics like above, this would be because you're operating on a closed DB. This can happen if you call `Close()` before sending a write, or call it multiple times. You should ensure that you only call `Close()` once, and all your read/write operations finish before closing.

### Are there any Go specific settings that I should use?

We *highly* recommend setting a high number for `GOMAXPROCS`, which allows Go to observe the full IOPS throughput provided by modern SSDs. In Dgraph, we have set it to 128. For more details, [see this thread](https://groups.google.com/d/topic/golang-nuts/jPb_h3TvlKE/discussion).

### Are there any Linux specific settings that I should use?

We recommend setting `max file descriptors` to a high number depending upon the expected size of your data. On Linux and Mac, you can check the file descriptor limit with `ulimit -n -H` for the hard limit and `ulimit -n -S` for the soft limit. A soft limit of `65535` is a good lower bound. You can adjust the limit as needed.

### I see "manifest has unsupported version: X (we support Y)" error.

This error means you have a badger directory which was created by an older version of badger and you're trying to open it with a newer version of badger. The underlying data format can change across badger versions and users will have to migrate their data directory.
Badger data can be migrated from version X of badger to version Y of badger by following the steps listed below. Assume you were on badger v1.6.0 and you wish to migrate to v2.0.0.

1. Install badger version v1.6.0
   - `cd $GOPATH/src/github.com/dgraph-io/badger`
   - `git checkout v1.6.0`
   - `cd badger && go install`

   This should install the old badger binary in your $GOBIN.
2. Create Backup
   - `badger backup --dir path/to/badger/directory -f badger.backup`
3. Install badger version v2.0.0
   - `cd $GOPATH/src/github.com/dgraph-io/badger`
   - `git checkout v2.0.0`
   - `cd badger && go install`

   This should install the new badger binary in your $GOBIN.
4. Run restore
   - `badger restore --dir path/to/new/badger/directory -f badger.backup`

   This will create a new directory at `path/to/new/badger/directory` and add badger data in the newer format to it.

NOTE - The above steps shouldn't cause any data loss but please ensure the new data is valid before deleting the old badger directory.

### Why do I need gcc to build badger? Does badger need CGO?

Badger does not directly use CGO but it relies on the https://github.com/DataDog/zstd library for zstd compression, and that library requires `gcc/cgo`. You can build badger without cgo by running `CGO_ENABLED=0 go build`. This will build badger without support for the ZSTD compression algorithm.

## Contact

- Please use [discuss.dgraph.io](https://discuss.dgraph.io) for questions, feature requests and discussions.
- Please use [Github issue tracker](https://github.com/dgraph-io/badger/issues) for filing bugs or feature requests.
- Join [![Slack Status](http://slack.dgraph.io/badge.svg)](http://slack.dgraph.io).
- Follow us on Twitter [@dgraphlabs](https://twitter.com/dgraphlabs).

badger-2.2007.2/VERSIONING.md000066400000000000000000000046201372173116500152050ustar00rootroot00000000000000# Serialization Versioning: Semantic Versioning for databases

Semantic Versioning, commonly known as SemVer, is a great idea that has been very widely adopted as a way to decide how to name software versions. The whole concept is very well summarized on semver.org with the following lines:

> Given a version number MAJOR.MINOR.PATCH, increment the:
>
> 1. MAJOR version when you make incompatible API changes,
> 2. MINOR version when you add functionality in a backwards-compatible manner, and
> 3. PATCH version when you make backwards-compatible bug fixes.
>
> Additional labels for pre-release and build metadata are available as extensions to the
> MAJOR.MINOR.PATCH format.

Unfortunately, API changes are not the most important changes for libraries that serialize data for later consumption. For these libraries, such as BadgerDB, changes to the API are much easier to handle than changes to the data format used to store data on disk.

## Serialization Version specification

Serialization Versioning, like Semantic Versioning, uses 3 numbers and also calls them MAJOR.MINOR.PATCH, but the semantics of the numbers are slightly modified:

Given a version number MAJOR.MINOR.PATCH, increment the:

- MAJOR version when you make changes that require a transformation of the dataset before it can be used again.
- MINOR version when old datasets are still readable but the API might have changed in backwards-compatible or incompatible ways.
- PATCH version when you make backwards-compatible bug fixes.

Additional labels for pre-release and build metadata are available as extensions to the MAJOR.MINOR.PATCH format.
Following this naming strategy, migration from v1.x to v2.x requires a migration strategy for your existing dataset, and as such has to be carefully planned. Migrations in between different minor versions (e.g. v1.5.x and v1.6.x) might break your build, as the API *might* have changed, but once your code compiles there's no need for any data migration. Lastly, changes in between two different patch versions should never break your build or dataset. For more background on our decision to adopt Serialization Versioning, read the blog post [Semantic Versioning, Go Modules, and Databases][blog] and the original proposal on [this comment on Dgraph's Discuss forum][discuss]. [blog]: https://blog.dgraph.io/post/serialization-versioning/ [discuss]: https://discuss.dgraph.io/t/go-modules-on-badger-and-dgraph/4662/7badger-2.2007.2/appveyor.yml000066400000000000000000000023761372173116500155760ustar00rootroot00000000000000# version format version: "{build}" # Operating system (build VM template) os: Windows Server 2012 R2 # Platform. platform: x64 clone_folder: c:\gopath\src\github.com\dgraph-io\badger # Environment variables environment: GOVERSION: 1.12 GOPATH: c:\gopath GO111MODULE: on # scripts that run after cloning repository install: - set PATH=%GOPATH%\bin;c:\go\bin;c:\msys64\mingw64\bin;%PATH% - go version - go env - python --version - gcc --version # To run your custom scripts instead of automatic MSBuild build_script: # We need to disable firewall - https://github.com/appveyor/ci/issues/1579#issuecomment-309830648 - ps: Disable-NetFirewallRule -DisplayName 'File and Printer Sharing (SMB-Out)' - cd c:\gopath\src\github.com\dgraph-io\badger - git branch - go get -t ./... # To run your custom scripts instead of automatic tests test_script: # Unit tests - ps: Add-AppveyorTest "Unit Tests" -Outcome Running - go test -v github.com/dgraph-io/badger/... - go test -v -vlog_mmap=false github.com/dgraph-io/badger/... - ps: Update-AppveyorTest "Unit Tests" -Outcome Passed notifications: - provider: Email to: - pawan@dgraph.io on_build_failure: true on_build_status_changed: true # to disable deployment deploy: off badger-2.2007.2/backup.go000066400000000000000000000163111372173116500147740ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bufio" "bytes" "context" "encoding/binary" "io" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/golang/protobuf/proto" ) // flushThreshold determines when a buffer will be flushed. When performing a // backup/restore, the entries will be batched up until the total size of batch // is more than flushThreshold or entry size (without the value size) is more // than the maxBatchSize. const flushThreshold = 100 << 20 // Backup is a wrapper function over Stream.Backup to generate full and incremental backups of the // DB. 
For more control over how many goroutines are used to generate the backup, or if you wish to // backup only a certain range of keys, use Stream.Backup directly. func (db *DB) Backup(w io.Writer, since uint64) (uint64, error) { stream := db.NewStream() stream.LogPrefix = "DB.Backup" return stream.Backup(w, since) } // Backup dumps a protobuf-encoded list of all entries in the database into the // given writer, that are newer than the specified version. It returns a // timestamp indicating when the entries were dumped which can be passed into a // later invocation to generate an incremental dump, of entries that have been // added/modified since the last invocation of Stream.Backup(). // // This can be used to backup the data in a database at a given point in time. func (stream *Stream) Backup(w io.Writer, since uint64) (uint64, error) { stream.KeyToList = func(key []byte, itr *Iterator) (*pb.KVList, error) { list := &pb.KVList{} for ; itr.Valid(); itr.Next() { item := itr.Item() if !bytes.Equal(item.Key(), key) { return list, nil } if item.Version() < since { // Ignore versions less than given timestamp, or skip older // versions of the given key. return list, nil } var valCopy []byte if !item.IsDeletedOrExpired() { // No need to copy value, if item is deleted or expired. var err error valCopy, err = item.ValueCopy(nil) if err != nil { stream.db.opt.Errorf("Key [%x, %d]. Error while fetching value [%v]\n", item.Key(), item.Version(), err) return nil, err } } // clear txn bits meta := item.meta &^ (bitTxn | bitFinTxn) kv := &pb.KV{ Key: item.KeyCopy(nil), Value: valCopy, UserMeta: []byte{item.UserMeta()}, Version: item.Version(), ExpiresAt: item.ExpiresAt(), Meta: []byte{meta}, } list.Kv = append(list.Kv, kv) switch { case item.DiscardEarlierVersions(): // If we need to discard earlier versions of this item, add a delete // marker just below the current version. list.Kv = append(list.Kv, &pb.KV{ Key: item.KeyCopy(nil), Version: item.Version() - 1, Meta: []byte{bitDelete}, }) return list, nil case item.IsDeletedOrExpired(): return list, nil } } return list, nil } var maxVersion uint64 stream.Send = func(list *pb.KVList) error { for _, kv := range list.Kv { if maxVersion < kv.Version { maxVersion = kv.Version } } return writeTo(list, w) } if err := stream.Orchestrate(context.Background()); err != nil { return 0, err } return maxVersion, nil } func writeTo(list *pb.KVList, w io.Writer) error { if err := binary.Write(w, binary.LittleEndian, uint64(proto.Size(list))); err != nil { return err } buf, err := proto.Marshal(list) if err != nil { return err } _, err = w.Write(buf) return err } // KVLoader is used to write KVList objects in to badger. It can be used to restore a backup. type KVLoader struct { db *DB throttle *y.Throttle entries []*Entry entriesSize int64 totalSize int64 } // NewKVLoader returns a new instance of KVLoader. func (db *DB) NewKVLoader(maxPendingWrites int) *KVLoader { return &KVLoader{ db: db, throttle: y.NewThrottle(maxPendingWrites), entries: make([]*Entry, 0, db.opt.maxBatchCount), } } // Set writes the key-value pair to the database. func (l *KVLoader) Set(kv *pb.KV) error { var userMeta, meta byte if len(kv.UserMeta) > 0 { userMeta = kv.UserMeta[0] } if len(kv.Meta) > 0 { meta = kv.Meta[0] } e := &Entry{ Key: y.KeyWithTs(kv.Key, kv.Version), Value: kv.Value, UserMeta: userMeta, ExpiresAt: kv.ExpiresAt, meta: meta, } estimatedSize := int64(e.estimateSize(l.db.opt.ValueThreshold)) // Flush entries if inserting the next entry would overflow the transactional limits. 
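// Note: the checks below guard different limits. maxBatchCount and
// maxBatchSize bound what a single transaction can commit, while
// flushThreshold bounds the total bytes buffered so far; totalSize,
// unlike entriesSize, includes the full value sizes.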
if int64(len(l.entries))+1 >= l.db.opt.maxBatchCount || l.entriesSize+estimatedSize >= l.db.opt.maxBatchSize || l.totalSize >= flushThreshold { if err := l.send(); err != nil { return err } } l.entries = append(l.entries, e) l.entriesSize += estimatedSize l.totalSize += estimatedSize + int64(len(e.Value)) return nil } func (l *KVLoader) send() error { if err := l.throttle.Do(); err != nil { return err } if err := l.db.batchSetAsync(l.entries, func(err error) { l.throttle.Done(err) }); err != nil { return err } l.entries = make([]*Entry, 0, l.db.opt.maxBatchCount) l.entriesSize = 0 l.totalSize = 0 return nil } // Finish is meant to be called after all the key-value pairs have been loaded. func (l *KVLoader) Finish() error { if len(l.entries) > 0 { if err := l.send(); err != nil { return err } } return l.throttle.Finish() } // Load reads a protobuf-encoded list of all entries from a reader and writes // them to the database. This can be used to restore the database from a backup // made by calling DB.Backup(). If more complex logic is needed to restore a badger // backup, the KVLoader interface should be used instead. // // DB.Load() should be called on a database that is not running any other // concurrent transactions while it is running. func (db *DB) Load(r io.Reader, maxPendingWrites int) error { br := bufio.NewReaderSize(r, 16<<10) unmarshalBuf := make([]byte, 1<<10) ldr := db.NewKVLoader(maxPendingWrites) for { var sz uint64 err := binary.Read(br, binary.LittleEndian, &sz) if err == io.EOF { break } else if err != nil { return err } if cap(unmarshalBuf) < int(sz) { unmarshalBuf = make([]byte, sz) } if _, err = io.ReadFull(br, unmarshalBuf[:sz]); err != nil { return err } list := &pb.KVList{} if err := proto.Unmarshal(unmarshalBuf[:sz], list); err != nil { return err } for _, kv := range list.Kv { if err := ldr.Set(kv); err != nil { return err } // Update nextTxnTs, memtable stores this // timestamp in badger head when flushed. if kv.Version >= db.orc.nextTxnTs { db.orc.nextTxnTs = kv.Version + 1 } } } if err := ldr.Finish(); err != nil { return err } db.orc.txnMark.Done(db.orc.nextTxnTs - 1) return nil } badger-2.2007.2/backup_test.go000066400000000000000000000322431372173116500160350ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package badger import ( "bytes" "fmt" "io/ioutil" "math/rand" "os" "path/filepath" "reflect" "strconv" "testing" "time" "github.com/dgraph-io/badger/v2/pb" "github.com/stretchr/testify/require" ) func TestBackupRestore1(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := Open(getTestOptions(dir)) require.NoError(t, err) // Write some stuff entries := []struct { key []byte val []byte userMeta byte version uint64 }{ {key: []byte("answer1"), val: []byte("42"), version: 1}, {key: []byte("answer2"), val: []byte("43"), userMeta: 1, version: 2}, } err = db.Update(func(txn *Txn) error { e := entries[0] err := txn.SetEntry(NewEntry(e.key, e.val).WithMeta(e.userMeta)) if err != nil { return err } return nil }) require.NoError(t, err) err = db.Update(func(txn *Txn) error { e := entries[1] err := txn.SetEntry(NewEntry(e.key, e.val).WithMeta(e.userMeta)) if err != nil { return err } return nil }) require.NoError(t, err) // Use different directory. dir, err = ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) bak, err := ioutil.TempFile(dir, "badgerbak") require.NoError(t, err) _, err = db.Backup(bak, 0) require.NoError(t, err) require.NoError(t, bak.Close()) require.NoError(t, db.Close()) db, err = Open(getTestOptions(dir)) require.NoError(t, err) defer db.Close() bak, err = os.Open(bak.Name()) require.NoError(t, err) defer bak.Close() require.NoError(t, db.Load(bak, 16)) err = db.View(func(txn *Txn) error { opts := DefaultIteratorOptions opts.AllVersions = true it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { item := it.Item() val, err := item.ValueCopy(nil) if err != nil { return err } require.Equal(t, entries[count].key, item.Key()) require.Equal(t, entries[count].val, val) require.Equal(t, entries[count].version, item.Version()) require.Equal(t, entries[count].userMeta, item.UserMeta()) count++ } require.Equal(t, count, 2) return nil }) require.NoError(t, err) require.Equal(t, db.orc.nextTs(), uint64(3)) } func TestBackupRestore2(t *testing.T) { tmpdir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(tmpdir) s1Path := filepath.Join(tmpdir, "test1") s2Path := filepath.Join(tmpdir, "test2") s3Path := filepath.Join(tmpdir, "test3") db1, err := Open(getTestOptions(s1Path)) require.NoError(t, err) defer db1.Close() key1 := []byte("key1") key2 := []byte("key2") rawValue := []byte("NotLongValue") N := byte(251) err = db1.Update(func(tx *Txn) error { if err := tx.SetEntry(NewEntry(key1, rawValue)); err != nil { return err } return tx.SetEntry(NewEntry(key2, rawValue)) }) require.NoError(t, err) for i := byte(1); i < N; i++ { err = db1.Update(func(tx *Txn) error { if err := tx.SetEntry(NewEntry(append(key1, i), rawValue)); err != nil { return err } return tx.SetEntry(NewEntry(append(key2, i), rawValue)) }) require.NoError(t, err) } var backup bytes.Buffer _, err = db1.Backup(&backup, 0) require.NoError(t, err) fmt.Println("backup1 length:", backup.Len()) db2, err := Open(getTestOptions(s2Path)) require.NoError(t, err) defer db2.Close() err = db2.Load(&backup, 16) require.NoError(t, err) // Check nextTs is correctly set. 
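// DB.Load bumps orc.nextTxnTs past the highest version it sees (see
// DB.Load in backup.go), so after a full restore both DBs should hand
// out the same next timestamp.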
	require.Equal(t, db1.orc.nextTs(), db2.orc.nextTs())

	for i := byte(1); i < N; i++ {
		err = db2.View(func(tx *Txn) error {
			k := append(key1, i)
			item, err := tx.Get(k)
			if err != nil {
				if err == ErrKeyNotFound {
					return fmt.Errorf("Key %q was not found, but was set\n", k)
				}
				return err
			}
			v, err := item.ValueCopy(nil)
			if err != nil {
				return err
			}
			if !reflect.DeepEqual(v, rawValue) {
				return fmt.Errorf("Values do not match, got %v, expected %v", v, rawValue)
			}
			return nil
		})
		require.NoError(t, err)
	}

	for i := byte(1); i < N; i++ {
		err = db2.Update(func(tx *Txn) error {
			if err := tx.SetEntry(NewEntry(append(key1, i), rawValue)); err != nil {
				return err
			}
			return tx.SetEntry(NewEntry(append(key2, i), rawValue))
		})
		require.NoError(t, err)
	}

	backup.Reset()
	_, err = db2.Backup(&backup, 0)
	require.NoError(t, err)
	fmt.Println("backup2 length:", backup.Len())

	db3, err := Open(getTestOptions(s3Path))
	require.NoError(t, err)
	defer db3.Close()

	err = db3.Load(&backup, 16)
	require.NoError(t, err)

	// Check nextTs is correctly set.
	require.Equal(t, db2.orc.nextTs(), db3.orc.nextTs())

	for i := byte(1); i < N; i++ {
		err = db3.View(func(tx *Txn) error {
			k := append(key1, i)
			item, err := tx.Get(k)
			if err != nil {
				if err == ErrKeyNotFound {
					return fmt.Errorf("Key %q was not found, but was set\n", k)
				}
				return err
			}
			v, err := item.ValueCopy(nil)
			if err != nil {
				return err
			}
			if !reflect.DeepEqual(v, rawValue) {
				return fmt.Errorf("Values do not match, got %v, expected %v", v, rawValue)
			}
			return nil
		})
		require.NoError(t, err)
	}
}

var randSrc = rand.NewSource(time.Now().UnixNano())

func createEntries(n int) []*pb.KV {
	entries := make([]*pb.KV, n)
	for i := 0; i < n; i++ {
		entries[i] = &pb.KV{
			Key:      []byte(fmt.Sprint("key", i)),
			Value:    []byte{1},
			UserMeta: []byte{0},
			Meta:     []byte{0},
		}
	}
	return entries
}

func populateEntries(db *DB, entries []*pb.KV) error {
	return db.Update(func(txn *Txn) error {
		var err error
		for i, e := range entries {
			if err = txn.SetEntry(NewEntry(e.Key, e.Value)); err != nil {
				return err
			}
			entries[i].Version = 1
		}
		return nil
	})
}

func TestBackup(t *testing.T) {
	test := func(t *testing.T, db *DB) {
		var bb bytes.Buffer

		N := 1000
		entries := createEntries(N)
		require.NoError(t, populateEntries(db, entries))

		_, err := db.Backup(&bb, 0)
		require.NoError(t, err)

		err = db.View(func(txn *Txn) error {
			opts := DefaultIteratorOptions
			it := txn.NewIterator(opts)
			defer it.Close()
			var count int
			for it.Rewind(); it.Valid(); it.Next() {
				item := it.Item()
				idx, err := strconv.Atoi(string(item.Key())[3:])
				if err != nil {
					return err
				}
				if idx > N || !bytes.Equal(entries[idx].Key, item.Key()) {
					return fmt.Errorf("%s: %s", string(item.Key()), ErrKeyNotFound)
				}
				count++
			}
			if N != count {
				return fmt.Errorf("wrong number of items: %d expected, %d actual", N, count)
			}
			return nil
		})
		require.NoError(t, err)
	}
	t.Run("disk mode", func(t *testing.T) {
		tmpdir, err := ioutil.TempDir("", "badger-test")
		require.NoError(t, err)
		defer removeDir(tmpdir)
		opt := DefaultOptions(filepath.Join(tmpdir, "backup0"))
		runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
			test(t, db)
		})
	})
	t.Run("InMemory mode", func(t *testing.T) {
		opt := DefaultOptions("")
		opt.InMemory = true
		runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
			test(t, db)
		})
	})
}

func TestBackupRestore3(t *testing.T) {
	var bb bytes.Buffer

	tmpdir, err := ioutil.TempDir("", "badger-test")
	require.NoError(t, err)
	defer removeDir(tmpdir)

	N := 1000
	entries := createEntries(N)
	var db1NextTs uint64

	// backup
	{
		db1, err := Open(DefaultOptions(filepath.Join(tmpdir, "backup1")))
		require.NoError(t,
err) defer db1.Close() require.NoError(t, populateEntries(db1, entries)) _, err = db1.Backup(&bb, 0) require.NoError(t, err) db1NextTs = db1.orc.nextTs() require.NoError(t, db1.Close()) } require.True(t, len(entries) == N) require.True(t, bb.Len() > 0) // restore db2, err := Open(DefaultOptions(filepath.Join(tmpdir, "restore1"))) require.NoError(t, err) defer db2.Close() require.NotEqual(t, db1NextTs, db2.orc.nextTs()) require.NoError(t, db2.Load(&bb, 16)) require.Equal(t, db1NextTs, db2.orc.nextTs()) // verify err = db2.View(func(txn *Txn) error { opts := DefaultIteratorOptions it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { item := it.Item() idx, err := strconv.Atoi(string(item.Key())[3:]) if err != nil { return err } if idx > N || !bytes.Equal(entries[idx].Key, item.Key()) { return fmt.Errorf("%s: %s", string(item.Key()), ErrKeyNotFound) } count++ } if N != count { return fmt.Errorf("wrong number of items: %d expected, %d actual", N, count) } return nil }) require.NoError(t, err) } func TestBackupLoadIncremental(t *testing.T) { tmpdir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(tmpdir) N := 100 entries := createEntries(N) updates := make(map[int]byte) var bb bytes.Buffer var db1NextTs uint64 // backup { db1, err := Open(DefaultOptions(filepath.Join(tmpdir, "backup2"))) require.NoError(t, err) defer db1.Close() require.NoError(t, populateEntries(db1, entries)) since, err := db1.Backup(&bb, 0) require.NoError(t, err) ints := rand.New(randSrc).Perm(N) // pick 10 items to mark as deleted. err = db1.Update(func(txn *Txn) error { for _, i := range ints[:10] { if err := txn.Delete(entries[i].Key); err != nil { return err } updates[i] = bitDelete } return nil }) require.NoError(t, err) since, err = db1.Backup(&bb, since) require.NoError(t, err) // pick 5 items to mark as expired. err = db1.Update(func(txn *Txn) error { for _, i := range (ints)[10:15] { entry := NewEntry(entries[i].Key, entries[i].Value).WithTTL(-time.Hour) if err := txn.SetEntry(entry); err != nil { return err } updates[i] = bitDelete // expired } return nil }) require.NoError(t, err) since, err = db1.Backup(&bb, since) require.NoError(t, err) // pick 5 items to mark as discard. 
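// WithDiscard sets the DiscardEarlierVersions bit on the entry. During
// backup, Stream.Backup writes an extra delete marker just below such a
// version (see backup.go), which is what lets these keys show up as
// updates in the verification below.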
err = db1.Update(func(txn *Txn) error { for _, i := range ints[15:20] { entry := NewEntry(entries[i].Key, entries[i].Value).WithDiscard() if err := txn.SetEntry(entry); err != nil { return err } updates[i] = bitDiscardEarlierVersions } return nil }) require.NoError(t, err) _, err = db1.Backup(&bb, since) require.NoError(t, err) db1NextTs = db1.orc.nextTs() require.NoError(t, db1.Close()) } require.True(t, len(entries) == N) require.True(t, bb.Len() > 0) // restore db2, err := Open(getTestOptions(filepath.Join(tmpdir, "restore2"))) require.NoError(t, err) defer db2.Close() require.NotEqual(t, db1NextTs, db2.orc.nextTs()) require.NoError(t, db2.Load(&bb, 16)) require.Equal(t, db1NextTs, db2.orc.nextTs()) // verify actual := make(map[int]byte) err = db2.View(func(txn *Txn) error { opts := DefaultIteratorOptions opts.AllVersions = true it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { item := it.Item() idx, err := strconv.Atoi(string(item.Key())[3:]) if err != nil { return err } if item.IsDeletedOrExpired() { _, ok := updates[idx] if !ok { return fmt.Errorf("%s: not expected to be updated but it is", string(item.Key())) } actual[idx] = item.meta count++ continue } } if len(updates) != count { return fmt.Errorf("mismatched updated items: %d expected, %d actual", len(updates), count) } return nil }) require.NoError(t, err, "%v %v", updates, actual) } func TestBackupBitClear(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueThreshold = 10 // This is important db, err := Open(opt) require.NoError(t, err) key := []byte("foo") val := []byte(fmt.Sprintf("%0100d", 1)) require.Greater(t, len(val), db.opt.ValueThreshold) err = db.Update(func(txn *Txn) error { e := NewEntry(key, val) // Value > valueTheshold so bitValuePointer will be set. return txn.SetEntry(e) }) require.NoError(t, err) // Use different directory. dir, err = ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) bak, err := ioutil.TempFile(dir, "badgerbak") require.NoError(t, err) _, err = db.Backup(bak, 0) require.NoError(t, err) require.NoError(t, bak.Close()) oldValue := db.orc.nextTs() require.NoError(t, db.Close()) opt = getTestOptions(dir) opt.ValueThreshold = 200 // This is important. db, err = Open(opt) require.NoError(t, err) defer db.Close() bak, err = os.Open(bak.Name()) require.NoError(t, err) defer bak.Close() require.NoError(t, db.Load(bak, 16)) // Ensure nextTs is still the same. require.Equal(t, oldValue, db.orc.nextTs()) require.NoError(t, db.View(func(txn *Txn) error { e, err := txn.Get(key) require.NoError(t, err) v, err := e.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val, v) return nil })) } badger-2.2007.2/badger/000077500000000000000000000000001372173116500144225ustar00rootroot00000000000000badger-2.2007.2/badger/.gitignore000066400000000000000000000000101372173116500164010ustar00rootroot00000000000000/badger badger-2.2007.2/badger/cmd/000077500000000000000000000000001372173116500151655ustar00rootroot00000000000000badger-2.2007.2/badger/cmd/backup.go000066400000000000000000000037131372173116500167650ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "bufio" "os" "github.com/dgraph-io/badger/v2" "github.com/spf13/cobra" ) var backupFile string var truncate bool // backupCmd represents the backup command var backupCmd = &cobra.Command{ Use: "backup", Short: "Backup Badger database.", Long: `Backup Badger database to a file in a version-agnostic manner. Iterates over each key-value pair, encodes it along with its metadata and version in protocol buffers and writes them to a file. This file can later be used by the restore command to create an identical copy of the database.`, RunE: doBackup, } func init() { RootCmd.AddCommand(backupCmd) backupCmd.Flags().StringVarP(&backupFile, "backup-file", "f", "badger.bak", "File to backup to") backupCmd.Flags().BoolVarP(&truncate, "truncate", "t", false, "Allow value log truncation if required.") } func doBackup(cmd *cobra.Command, args []string) error { // Open DB db, err := badger.Open(badger.DefaultOptions(sstDir). WithValueDir(vlogDir). WithTruncate(truncate)) if err != nil { return err } defer db.Close() // Create File f, err := os.Create(backupFile) if err != nil { return err } bw := bufio.NewWriterSize(f, 64<<20) if _, err = db.Backup(bw, 0); err != nil { return err } if err = bw.Flush(); err != nil { return err } if err = f.Sync(); err != nil { return err } return f.Close() } badger-2.2007.2/badger/cmd/bank.go000066400000000000000000000335751372173116500164440ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "bytes" "context" "errors" "fmt" "io/ioutil" "log" "math" "math/rand" "strconv" "sync" "sync/atomic" "time" "github.com/dgraph-io/badger/v2" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/spf13/cobra" ) var testCmd = &cobra.Command{ Use: "bank", Short: "Run bank test on Badger.", Long: ` This command runs bank test on Badger, inspired by Jepsen. It creates many accounts and moves money among them transactionally. It also reads the sum total of all the accounts, to ensure that the total never changes. `, } var bankTest = &cobra.Command{ Use: "test", Short: "Execute bank test on Badger.", RunE: runTest, } var bankDisect = &cobra.Command{ Use: "disect", Short: "Disect the bank output.", Long: ` Disect the bank output BadgerDB to find the first transaction which causes failure of the total invariant. 
`,
	RunE: runDisect,
}

var (
	numGoroutines   int
	numAccounts     int
	numPrevious     int
	duration        string
	stopAll         int32
	mmap            bool
	checkStream     bool
	checkSubscriber bool
	verbose         bool
	encryptionKey   string
)

const (
	keyPrefix         = "account:"
	initialBal uint64 = 100
)

func init() {
	RootCmd.AddCommand(testCmd)
	testCmd.AddCommand(bankTest)
	testCmd.AddCommand(bankDisect)
	testCmd.Flags().IntVarP(
		&numAccounts, "accounts", "a", 10000, "Number of accounts in the bank.")
	bankTest.Flags().IntVarP(
		&numGoroutines, "conc", "c", 16, "Number of concurrent transactions to run.")
	bankTest.Flags().StringVarP(&duration, "duration", "d", "3m", "How long to run the test.")
	bankTest.Flags().BoolVarP(&mmap, "mmap", "m", false, "If true, mmap LSM tree. Default is RAM.")
	bankTest.Flags().BoolVarP(&checkStream, "check_stream", "s", false,
		"If true, the test will send transactions to another badger instance via the stream "+
			"interface in order to verify that all data is streamed correctly.")
	bankTest.Flags().BoolVarP(&checkSubscriber, "check_subscriber", "w", false,
		"If true, the test will send transactions to another badger instance via the subscriber "+
			"interface in order to verify that all the data is published correctly.")
	bankTest.Flags().BoolVarP(&verbose, "verbose", "v", false,
		"If true, the test will print all the executed bank transfers to standard output. "+
			"This outputs a lot so it's best to turn it off when running the test for a while.")
	bankTest.Flags().StringVarP(&encryptionKey, "encryption-key", "e", "",
		"If set, badger will encrypt all the data stored on the disk.")
	bankDisect.Flags().IntVarP(&numPrevious, "previous", "p", 12,
		"Starting from the violation txn, how many previous versions to retrieve.")
	bankDisect.Flags().StringVar(&encryptionKey, "decryption-key", "",
		"If set, DB will be opened using the provided decryption key.")
}

func key(account int) []byte {
	return []byte(fmt.Sprintf("%s%s", keyPrefix, strconv.Itoa(account)))
}

func toUint64(val []byte) uint64 {
	u, err := strconv.ParseUint(string(val), 10, 64)
	y.Check(err)
	return uint64(u)
}

func toSlice(bal uint64) []byte {
	return []byte(strconv.FormatUint(bal, 10))
}

func getBalance(txn *badger.Txn, account int) (uint64, error) {
	item, err := txn.Get(key(account))
	if err != nil {
		return 0, err
	}

	var bal uint64
	err = item.Value(func(v []byte) error {
		bal = toUint64(v)
		return nil
	})
	return bal, err
}

func putBalance(txn *badger.Txn, account int, bal uint64) error {
	return txn.SetEntry(badger.NewEntry(key(account), toSlice(bal)))
}

func min(a, b uint64) uint64 {
	if a < b {
		return a
	}
	return b
}

var errAbandoned = errors.New("Transaction abandoned due to insufficient balance")

func moveMoney(db *badger.DB, from, to int) error {
	return db.Update(func(txn *badger.Txn) error {
		balf, err := getBalance(txn, from)
		if err != nil {
			return err
		}
		balt, err := getBalance(txn, to)
		if err != nil {
			return err
		}

		floor := min(balf, balt)
		if floor < 5 {
			return errAbandoned
		}
		// Move the money.
		balf -= 5
		balt += 5

		if err = putBalance(txn, from, balf); err != nil {
			return err
		}
		return putBalance(txn, to, balt)
	})
}

type account struct {
	Id  int
	Bal uint64
}

func diff(a, b []account) string {
	var buf bytes.Buffer
	y.AssertTruef(len(a) == len(b), "len(a)=%d. len(b)=%d\n", len(a), len(b))
	for i := range a {
		ai := a[i]
		bi := b[i]
		if ai.Id != bi.Id || ai.Bal != bi.Bal {
			buf.WriteString(fmt.Sprintf("Index: %d. Account [%+v] -> [%+v]\n", i, ai, bi))
		}
	}
	return buf.String()
}

var errFailure = errors.New("test failed due to balance mismatch")

// seekTotal retrieves the total of all accounts by seeking for each account key.
func seekTotal(txn *badger.Txn) ([]account, error) {
	expected := uint64(numAccounts) * uint64(initialBal)
	var accounts []account

	var total uint64
	for i := 0; i < numAccounts; i++ {
		item, err := txn.Get(key(i))
		if err != nil {
			log.Printf("Error for account: %d. err=%v. key=%q\n", i, err, key(i))
			return accounts, err
		}
		val, err := item.ValueCopy(nil)
		if err != nil {
			return accounts, err
		}
		acc := account{
			Id:  i,
			Bal: toUint64(val),
		}
		accounts = append(accounts, acc)
		total += acc.Bal
	}
	if total != expected {
		log.Printf("Balance did NOT match up. Expected: %d. Received: %d", expected, total)
		atomic.AddInt32(&stopAll, 1)
		return accounts, errFailure
	}
	return accounts, nil
}

// Range is [lowTs, highTs).
func findFirstInvalidTxn(db *badger.DB, lowTs, highTs uint64) uint64 {
	checkAt := func(ts uint64) error {
		txn := db.NewTransactionAt(ts, false)
		_, err := seekTotal(txn)
		txn.Discard()
		return err
	}

	if highTs-lowTs < 1 {
		log.Printf("Checking at lowTs: %d\n", lowTs)
		err := checkAt(lowTs)
		if err == errFailure {
			fmt.Printf("Violation at ts: %d\n", lowTs)
			return lowTs
		} else if err != nil {
			log.Printf("Error at lowTs: %d. Err=%v\n", lowTs, err)
			return 0
		}
		fmt.Printf("No violation found at ts: %d\n", lowTs)
		return 0
	}

	midTs := (lowTs + highTs) / 2
	log.Println()
	log.Printf("Checking. low=%d. high=%d. mid=%d\n", lowTs, highTs, midTs)
	err := checkAt(midTs)
	if err == badger.ErrKeyNotFound || err == nil {
		// If no failure, move to higher ts.
		return findFirstInvalidTxn(db, midTs+1, highTs)
	}
	// Found an error.
	return findFirstInvalidTxn(db, lowTs, midTs)
}

func compareTwo(db *badger.DB, before, after uint64) {
	fmt.Printf("Comparing @ts=%d with @ts=%d\n", before, after)
	txn := db.NewTransactionAt(before, false)
	prev, err := seekTotal(txn)
	if err == errFailure {
		// pass
	} else {
		y.Check(err)
	}
	txn.Discard()

	txn = db.NewTransactionAt(after, false)
	now, err := seekTotal(txn)
	if err == errFailure {
		// pass
	} else {
		y.Check(err)
	}
	txn.Discard()

	fmt.Println(diff(prev, now))
}

func runDisect(cmd *cobra.Command, args []string) error {
	// The total did not match up. So, let's disect the DB to find the
	// transaction which caused the total mismatch.
	db, err := badger.OpenManaged(badger.DefaultOptions(sstDir).
		WithValueDir(vlogDir).
		WithReadOnly(true).
		WithEncryptionKey([]byte(encryptionKey)))
	if err != nil {
		return err
	}
	fmt.Println("opened db")

	var min, max uint64 = math.MaxUint64, 0
	{
		txn := db.NewTransactionAt(uint64(math.MaxUint32), false)
		iopt := badger.DefaultIteratorOptions
		iopt.AllVersions = true
		itr := txn.NewIterator(iopt)
		for itr.Rewind(); itr.Valid(); itr.Next() {
			item := itr.Item()
			if min > item.Version() {
				min = item.Version()
			}
			if max < item.Version() {
				max = item.Version()
			}
		}
		itr.Close()
		txn.Discard()
	}

	log.Printf("min=%d. max=%d\n", min, max)
	ts := findFirstInvalidTxn(db, min, max)
	fmt.Println()
	if ts == 0 {
		fmt.Println("Nothing found. Exiting.")
		return nil
	}

	for i := 0; i < numPrevious; i++ {
		compareTwo(db, ts-1-uint64(i), ts-uint64(i))
	}
	return nil
}

func runTest(cmd *cobra.Command, args []string) error {
	rand.Seed(time.Now().UnixNano())

	// Open DB
	opts := badger.DefaultOptions(sstDir).
		WithValueDir(vlogDir).
		WithMaxTableSize(4 << 20). // Force more compactions.
		WithNumLevelZeroTables(2).
		WithNumMemtables(2).
		// Do not GC any versions, because we need them for the disect.
WithNumVersionsToKeep(int(math.MaxInt32)). WithValueThreshold(1) // Make all values go to value log if mmap { opts = opts.WithTableLoadingMode(options.MemoryMap) } if encryptionKey != "" { opts = opts.WithEncryptionKey([]byte(encryptionKey)) // The following comment is intentional as we would need the encryption key in case // we want to run disect tool on the directory generated by bank test tool. log.Printf("Using encryption key %s\n", encryptionKey) } log.Printf("Opening DB with options: %+v\n", opts) db, err := badger.Open(opts) if err != nil { return err } defer db.Close() var tmpDb *badger.DB var subscribeDB *badger.DB if checkSubscriber { dir, err := ioutil.TempDir("", "bank_subscribe") y.Check(err) subscribeDB, err = badger.Open(badger.DefaultOptions(dir).WithSyncWrites(false)) if err != nil { return err } defer subscribeDB.Close() } if checkStream { dir, err := ioutil.TempDir("", "bank_stream") y.Check(err) tmpDb, err = badger.Open(badger.DefaultOptions(dir).WithSyncWrites(false)) if err != nil { return err } defer tmpDb.Close() } wb := db.NewWriteBatch() for i := 0; i < numAccounts; i++ { y.Check(wb.Set(key(i), toSlice(initialBal))) } log.Println("Waiting for writes to be done...") y.Check(wb.Flush()) log.Println("Bank initialization OK. Commencing test.") log.Printf("Running with %d accounts, and %d goroutines.\n", numAccounts, numGoroutines) log.Printf("Using keyPrefix: %s\n", keyPrefix) dur, err := time.ParseDuration(duration) y.Check(err) // startTs := time.Now() endTs := time.Now().Add(dur) var total, errors, reads uint64 var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() ticker := time.NewTicker(time.Second) defer ticker.Stop() for range ticker.C { if atomic.LoadInt32(&stopAll) > 0 { // Do not proceed. return } // log.Printf("[%6s] Total: %d. Errors: %d Reads: %d.\n", // time.Since(startTs).Round(time.Second).String(), // atomic.LoadUint64(&total), // atomic.LoadUint64(&errors), // atomic.LoadUint64(&reads)) if time.Now().After(endTs) { return } } }() // RW goroutines. for i := 0; i < numGoroutines; i++ { wg.Add(1) go func() { defer wg.Done() ticker := time.NewTicker(10 * time.Microsecond) defer ticker.Stop() for range ticker.C { if atomic.LoadInt32(&stopAll) > 0 { // Do not proceed. return } if time.Now().After(endTs) { return } from := rand.Intn(numAccounts) to := rand.Intn(numAccounts) if from == to { continue } err := moveMoney(db, from, to) atomic.AddUint64(&total, 1) if err == nil && verbose { log.Printf("Moved $5. %d -> %d\n", from, to) } else { atomic.AddUint64(&errors, 1) } } }() } if checkStream { wg.Add(1) go func() { defer wg.Done() ticker := time.NewTicker(time.Second) defer ticker.Stop() for range ticker.C { log.Printf("Received stream\n") // Do not proceed. if atomic.LoadInt32(&stopAll) > 0 || time.Now().After(endTs) { return } // Clean up the database receiving the stream. err = tmpDb.DropAll() y.Check(err) batch := tmpDb.NewWriteBatch() stream := db.NewStream() stream.Send = func(list *pb.KVList) error { for _, kv := range list.Kv { if err := batch.Set(kv.Key, kv.Value); err != nil { return err } } return nil } y.Check(stream.Orchestrate(context.Background())) y.Check(batch.Flush()) y.Check(tmpDb.View(func(txn *badger.Txn) error { _, err := seekTotal(txn) if err != nil { log.Printf("Error while calculating total in stream: %v", err) } return nil })) } }() } // RO goroutine. 
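// This read-only goroutine repeatedly runs seekTotal in View transactions,
// checking that the balances still sum to numAccounts * initialBal while
// the RW goroutines above keep moving money around.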
wg.Add(1) go func() { defer wg.Done() ticker := time.NewTicker(10 * time.Microsecond) defer ticker.Stop() for range ticker.C { if atomic.LoadInt32(&stopAll) > 0 { // Do not proceed. return } if time.Now().After(endTs) { return } y.Check(db.View(func(txn *badger.Txn) error { _, err := seekTotal(txn) if err != nil { log.Printf("Error while calculating total: %v", err) } else { atomic.AddUint64(&reads, 1) } return nil })) } }() ctx, cancel := context.WithCancel(context.Background()) defer cancel() var subWg sync.WaitGroup if checkSubscriber { subWg.Add(1) go func() { defer subWg.Done() accountIDS := [][]byte{} for i := 0; i < numAccounts; i++ { accountIDS = append(accountIDS, key(i)) } updater := func(kvs *pb.KVList) error { batch := subscribeDB.NewWriteBatch() for _, kv := range kvs.GetKv() { y.Check(batch.Set(kv.Key, kv.Value)) } return batch.Flush() } _ = db.Subscribe(ctx, updater, accountIDS...) }() } wg.Wait() if checkSubscriber { cancel() subWg.Wait() y.Check(subscribeDB.View(func(txn *badger.Txn) error { _, err := seekTotal(txn) if err != nil { log.Printf("Error while calculating subscriber DB total: %v", err) } else { atomic.AddUint64(&reads, 1) } return nil })) } if atomic.LoadInt32(&stopAll) == 0 { log.Println("Test OK") return nil } log.Println("Test FAILED") return fmt.Errorf("Test FAILED") } badger-2.2007.2/badger/cmd/bench.go000066400000000000000000000017101372173116500165720ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "github.com/spf13/cobra" ) var benchCmd = &cobra.Command{ Use: "benchmark", Short: "Benchmark Badger database.", Long: `This command will benchmark Badger for different usecases. Currently only read benchmark is supported. Useful for testing and performance analysis.`, } func init() { RootCmd.AddCommand(benchCmd) } badger-2.2007.2/badger/cmd/flatten.go000066400000000000000000000026251372173116500171560ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "github.com/dgraph-io/badger/v2" "github.com/spf13/cobra" ) var flattenCmd = &cobra.Command{ Use: "flatten", Short: "Flatten the LSM tree.", Long: ` This command would compact all the LSM tables into one level. `, RunE: flatten, } var numWorkers int func init() { RootCmd.AddCommand(flattenCmd) flattenCmd.Flags().IntVarP(&numWorkers, "num-workers", "w", 1, "Number of concurrent compactors to run. 
More compactors would use more"+ " server resources to potentially achieve faster compactions.") } func flatten(cmd *cobra.Command, args []string) error { db, err := badger.Open(badger.DefaultOptions(sstDir). WithValueDir(vlogDir). WithTruncate(truncate). WithNumCompactors(0)) if err != nil { return err } defer db.Close() return db.Flatten(numWorkers) } badger-2.2007.2/badger/cmd/info.go000066400000000000000000000273651372173116500164640ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "bytes" "encoding/hex" "fmt" "io/ioutil" "os" "path/filepath" "sort" "strings" "time" "github.com/pkg/errors" "github.com/dgraph-io/badger/v2" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" humanize "github.com/dustin/go-humanize" "github.com/spf13/cobra" ) type flagOptions struct { showTables bool showHistogram bool showKeys bool withPrefix string keyLookup string itemMeta bool keyHistory bool showInternal bool readOnly bool truncate bool } var ( opt flagOptions ) func init() { RootCmd.AddCommand(infoCmd) infoCmd.Flags().BoolVarP(&opt.showTables, "show-tables", "s", false, "If set to true, show tables as well.") infoCmd.Flags().BoolVar(&opt.showHistogram, "histogram", false, "Show a histogram of the key and value sizes.") infoCmd.Flags().BoolVar(&opt.showKeys, "show-keys", false, "Show keys stored in Badger") infoCmd.Flags().StringVar(&opt.withPrefix, "with-prefix", "", "Consider only the keys with specified prefix") infoCmd.Flags().StringVarP(&opt.keyLookup, "lookup", "l", "", "Hex of the key to lookup") infoCmd.Flags().BoolVar(&opt.itemMeta, "show-meta", true, "Output item meta data as well") infoCmd.Flags().BoolVar(&opt.keyHistory, "history", false, "Show all versions of a key") infoCmd.Flags().BoolVar( &opt.showInternal, "show-internal", false, "Show internal keys along with other keys."+ " This option should be used along with --show-key option") infoCmd.Flags().BoolVar(&opt.readOnly, "read-only", true, "If set to true, DB will be opened "+ "in read only mode. If DB has not been closed properly, this option can be set to false "+ "to open DB.") infoCmd.Flags().BoolVar(&opt.truncate, "truncate", false, "If set to true, it allows "+ "truncation of value log files if they have corrupt data.") } var infoCmd = &cobra.Command{ Use: "info", Short: "Health info about Badger database.", Long: ` This command prints information about the badger key-value store. It reads MANIFEST and prints its info. It also prints info about missing/extra files, and general information about the value log files (which are not referenced by the manifest). Use this tool to report any issues about Badger to the Dgraph team. 
`, RunE: handleInfo, } func handleInfo(cmd *cobra.Command, args []string) error { if err := printInfo(sstDir, vlogDir); err != nil { return errors.Wrap(err, "failed to print information in MANIFEST file") } // Open DB db, err := badger.Open(badger.DefaultOptions(sstDir). WithValueDir(vlogDir). WithReadOnly(opt.readOnly). WithTruncate(opt.truncate). WithTableLoadingMode(options.MemoryMap)) if err != nil { return errors.Wrap(err, "failed to open database") } defer db.Close() if opt.showTables { tableInfo(sstDir, vlogDir, db) } prefix, err := hex.DecodeString(opt.withPrefix) if err != nil { return errors.Wrapf(err, "failed to decode hex prefix: %s", opt.withPrefix) } if opt.showHistogram { db.PrintHistogram(prefix) } if opt.showKeys { if err := showKeys(db, prefix); err != nil { return err } } if len(opt.keyLookup) > 0 { if err := lookup(db); err != nil { return errors.Wrapf(err, "failed to perform lookup for the key: %x", opt.keyLookup) } } return nil } func showKeys(db *badger.DB, prefix []byte) error { if len(prefix) > 0 { fmt.Printf("Only choosing keys with prefix: \n%s", hex.Dump(prefix)) } txn := db.NewTransaction(false) defer txn.Discard() iopt := badger.DefaultIteratorOptions iopt.Prefix = []byte(prefix) iopt.PrefetchValues = false iopt.AllVersions = opt.keyHistory iopt.InternalAccess = opt.showInternal it := txn.NewIterator(iopt) defer it.Close() totalKeys := 0 for it.Rewind(); it.Valid(); it.Next() { item := it.Item() if err := printKey(item, false); err != nil { return errors.Wrapf(err, "failed to print information about key: %x(%d)", item.Key(), item.Version()) } totalKeys++ } fmt.Print("\n[Summary]\n") fmt.Println("Total Number of keys:", totalKeys) return nil } func lookup(db *badger.DB) error { txn := db.NewTransaction(false) defer txn.Discard() key, err := hex.DecodeString(opt.keyLookup) if err != nil { return errors.Wrapf(err, "failed to decode key: %q", opt.keyLookup) } iopts := badger.DefaultIteratorOptions iopts.AllVersions = opt.keyHistory iopts.PrefetchValues = opt.keyHistory itr := txn.NewKeyIterator(key, iopts) defer itr.Close() itr.Rewind() if !itr.Valid() { return errors.Errorf("Unable to rewind to key:\n%s", hex.Dump(key)) } fmt.Println() item := itr.Item() if err := printKey(item, true); err != nil { return errors.Wrapf(err, "failed to print information about key: %x(%d)", item.Key(), item.Version()) } if !opt.keyHistory { return nil } itr.Next() // Move to the next key for ; itr.Valid(); itr.Next() { item := itr.Item() if !bytes.Equal(key, item.Key()) { break } if err := printKey(item, true); err != nil { return errors.Wrapf(err, "failed to print information about key: %x(%d)", item.Key(), item.Version()) } } return nil } func printKey(item *badger.Item, showValue bool) error { var buf bytes.Buffer fmt.Fprintf(&buf, "Key: %x\tversion: %d", item.Key(), item.Version()) if opt.itemMeta { fmt.Fprintf(&buf, "\tsize: %d\tmeta: b%04b", item.EstimatedSize(), item.UserMeta()) } if item.IsDeletedOrExpired() { buf.WriteString("\t{deleted}") } if item.DiscardEarlierVersions() { buf.WriteString("\t{discard}") } if showValue { val, err := item.ValueCopy(nil) if err != nil { return errors.Wrapf(err, "failed to copy value of the key: %x(%d)", item.Key(), item.Version()) } fmt.Fprintf(&buf, "\n\tvalue: %v", val) } fmt.Println(buf.String()) return nil } func hbytes(sz int64) string { return humanize.Bytes(uint64(sz)) } func dur(src, dst time.Time) string { return humanize.RelTime(dst, src, "earlier", "later") } func tableInfo(dir, valueDir string, db *badger.DB) { // we want all 
tables with keys count here. tables := db.Tables(true) fmt.Println() fmt.Println("SSTable [Li, Id, Total Keys including internal keys] " + "[Left Key, Version -> Right Key, Version]") for _, t := range tables { lk, lt := y.ParseKey(t.Left), y.ParseTs(t.Left) rk, rt := y.ParseKey(t.Right), y.ParseTs(t.Right) fmt.Printf("SSTable [L%d, %03d, %07d] [%20X, v%d -> %20X, v%d]\n", t.Level, t.ID, t.KeyCount, lk, lt, rk, rt) } fmt.Println() } func printInfo(dir, valueDir string) error { if dir == "" { return fmt.Errorf("--dir not supplied") } if valueDir == "" { valueDir = dir } fp, err := os.Open(filepath.Join(dir, badger.ManifestFilename)) if err != nil { return err } defer func() { if fp != nil { fp.Close() } }() manifest, truncOffset, err := badger.ReplayManifestFile(fp) if err != nil { return err } fp.Close() fp = nil fileinfos, err := ioutil.ReadDir(dir) if err != nil { return err } fileinfoByName := make(map[string]os.FileInfo) fileinfoMarked := make(map[string]bool) for _, info := range fileinfos { fileinfoByName[info.Name()] = info fileinfoMarked[info.Name()] = false } fmt.Println() var baseTime time.Time manifestTruncated := false manifestInfo, ok := fileinfoByName[badger.ManifestFilename] if ok { fileinfoMarked[badger.ManifestFilename] = true truncatedString := "" if truncOffset != manifestInfo.Size() { truncatedString = fmt.Sprintf(" [TRUNCATED to %d]", truncOffset) manifestTruncated = true } baseTime = manifestInfo.ModTime() fmt.Printf("[%25s] %-12s %6s MA%s\n", manifestInfo.ModTime().Format(time.RFC3339), manifestInfo.Name(), hbytes(manifestInfo.Size()), truncatedString) } else { // manifestInfo is nil here (the map lookup failed), so print the expected // filename instead of dereferencing it. fmt.Printf("%s [MISSING]\n", badger.ManifestFilename) } numMissing := 0 numEmpty := 0 levelSizes := make([]int64, len(manifest.Levels)) for level, lm := range manifest.Levels { // fmt.Printf("\n[Level %d]\n", level) // We create a sorted list of table IDs so that output is in consistent order. tableIDs := make([]uint64, 0, len(lm.Tables)) for id := range lm.Tables { tableIDs = append(tableIDs, id) } sort.Slice(tableIDs, func(i, j int) bool { return tableIDs[i] < tableIDs[j] }) for _, tableID := range tableIDs { tableFile := table.IDToFilename(tableID) _, ok1 := manifest.Tables[tableID] file, ok2 := fileinfoByName[tableFile] if ok1 && ok2 { fileinfoMarked[tableFile] = true emptyString := "" fileSize := file.Size() if fileSize == 0 { emptyString = " [EMPTY]" numEmpty++ } levelSizes[level] += fileSize // (Put level on every line to make it easier to process with sed/perl.) fmt.Printf("[%25s] %-12s %6s L%d %s\n", dur(baseTime, file.ModTime()), tableFile, hbytes(fileSize), level, emptyString) } else { fmt.Printf("%s [MISSING]\n", tableFile) numMissing++ } } } valueDirFileinfos := fileinfos if valueDir != dir { valueDirFileinfos, err = ioutil.ReadDir(valueDir) if err != nil { return err } } // If valueDir is different from dir, valueDirExtras holds the extra files found in the value dir. 
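// Note on the bookkeeping that follows: every file that is accounted for (the // MANIFEST, the SSTables matched against it above, and the .vlog files scanned // below) gets marked in fileinfoMarked; whatever remains unmarked at the end is // reported under [EXTRA], or [ValueDir EXTRA] when a separate value directory is used. 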
valueDirExtras := []os.FileInfo{} valueLogSize := int64(0) // fmt.Print("\n[Value Log]\n") for _, file := range valueDirFileinfos { if !strings.HasSuffix(file.Name(), ".vlog") { if valueDir != dir { valueDirExtras = append(valueDirExtras, file) } continue } fileSize := file.Size() emptyString := "" if fileSize == 0 { emptyString = " [EMPTY]" numEmpty++ } valueLogSize += fileSize fmt.Printf("[%25s] %-12s %6s VL%s\n", dur(baseTime, file.ModTime()), file.Name(), hbytes(fileSize), emptyString) fileinfoMarked[file.Name()] = true } numExtra := 0 for _, file := range fileinfos { if fileinfoMarked[file.Name()] { continue } if numExtra == 0 { fmt.Print("\n[EXTRA]\n") } fmt.Printf("[%s] %-12s %6s\n", file.ModTime().Format(time.RFC3339), file.Name(), hbytes(file.Size())) numExtra++ } numValueDirExtra := 0 for _, file := range valueDirExtras { if numValueDirExtra == 0 { fmt.Print("\n[ValueDir EXTRA]\n") } fmt.Printf("[%s] %-12s %6s\n", file.ModTime().Format(time.RFC3339), file.Name(), hbytes(file.Size())) numValueDirExtra++ } fmt.Print("\n[Summary]\n") totalIndexSize := int64(0) for i, sz := range levelSizes { fmt.Printf("Level %d size: %12s\n", i, hbytes(sz)) totalIndexSize += sz } fmt.Printf("Total index size: %8s\n", hbytes(totalIndexSize)) fmt.Printf("Value log size: %10s\n", hbytes(valueLogSize)) fmt.Println() totalExtra := numExtra + numValueDirExtra if totalExtra == 0 && numMissing == 0 && numEmpty == 0 && !manifestTruncated { fmt.Println("Abnormalities: None.") } else { fmt.Println("Abnormalities:") } fmt.Printf("%d extra %s.\n", totalExtra, pluralFiles(totalExtra)) fmt.Printf("%d missing %s.\n", numMissing, pluralFiles(numMissing)) fmt.Printf("%d empty %s.\n", numEmpty, pluralFiles(numEmpty)) fmt.Printf("%d truncated %s.\n", boolToNum(manifestTruncated), pluralManifest(manifestTruncated)) return nil } func boolToNum(x bool) int { if x { return 1 } return 0 } func pluralManifest(manifestTruncated bool) string { if manifestTruncated { return "manifest" } return "manifests" } func pluralFiles(count int) string { if count == 1 { return "file" } return "files" } badger-2.2007.2/badger/cmd/read_bench.go000066400000000000000000000143001372173116500175640ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package cmd import ( "context" "fmt" "math/rand" "strings" "sync/atomic" "time" humanize "github.com/dustin/go-humanize" "github.com/spf13/cobra" "github.com/dgraph-io/badger/v2" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" ) var readBenchCmd = &cobra.Command{ Use: "read", Short: "Read data from Badger randomly to benchmark read speed.", Long: ` This command reads data from existing Badger database randomly using multiple go routines.`, RunE: readBench, } var ( sizeRead uint64 // will store size read till now entriesRead uint64 // will store entries read till now startTime time.Time // start time of read benchmarking sampleSize int loadingMode string keysOnly bool readOnly bool ) func init() { benchCmd.AddCommand(readBenchCmd) readBenchCmd.Flags().IntVarP( &numGoroutines, "goroutines", "g", 16, "Number of goroutines to run for reading.") readBenchCmd.Flags().StringVarP( &duration, "duration", "d", "1m", "How long to run the benchmark.") readBenchCmd.Flags().IntVar( &sampleSize, "sample-size", 1000000, "Keys sample size to be used for random lookup.") readBenchCmd.Flags().BoolVar( &keysOnly, "keys-only", false, "If false, values will also be read.") readBenchCmd.Flags().BoolVar( &readOnly, "read-only", true, "If true, DB will be opened in read only mode.") readBenchCmd.Flags().StringVar( &loadingMode, "loading-mode", "mmap", "Mode for accessing SSTables and value log files. "+ "Valid loading modes are fileio and mmap.") } func readBench(cmd *cobra.Command, args []string) error { rand.Seed(time.Now().Unix()) dur, err := time.ParseDuration(duration) if err != nil { return y.Wrapf(err, "unable to parse duration") } y.AssertTrue(numGoroutines > 0) mode := getLoadingMode(loadingMode) db, err := badger.Open(badger.DefaultOptions(sstDir). WithValueDir(vlogDir). WithReadOnly(readOnly). WithTableLoadingMode(mode). 
WithValueLogLoadingMode(mode)) if err != nil { return y.Wrapf(err, "unable to open DB") } defer db.Close() now := time.Now() keys, err := getSampleKeys(db) if err != nil { return y.Wrapf(err, "error while sampling keys") } fmt.Println("*********************************************************") fmt.Printf("Total Sampled Keys: %d, read in time: %s\n", len(keys), time.Since(now)) fmt.Println("*********************************************************") if len(keys) == 0 { fmt.Println("DB is empty, hence returning") return nil } fmt.Println("*********************************************************") fmt.Println("Starting to benchmark Reads") fmt.Println("*********************************************************") c := y.NewCloser(0) startTime = time.Now() for i := 0; i < numGoroutines; i++ { c.AddRunning(1) go readKeys(db, c, keys) } // also start printing stats c.AddRunning(1) go printStats(c) <-time.After(dur) c.SignalAndWait() return nil } func printStats(c *y.Closer) { defer c.Done() t := time.NewTicker(time.Second) defer t.Stop() for { select { case <-c.HasBeenClosed(): return case <-t.C: dur := time.Since(startTime) sz := atomic.LoadUint64(&sizeRead) entries := atomic.LoadUint64(&entriesRead) bytesRate := sz / uint64(dur.Seconds()) entriesRate := entries / uint64(dur.Seconds()) fmt.Printf("Time elapsed: %s, bytes read: %s, speed: %s/sec, "+ "entries read: %d, speed: %d/sec\n", y.FixedDuration(time.Since(startTime)), humanize.Bytes(sz), humanize.Bytes(bytesRate), entries, entriesRate) } } } func readKeys(db *badger.DB, c *y.Closer, keys [][]byte) { defer c.Done() r := rand.New(rand.NewSource(time.Now().Unix())) for { select { case <-c.HasBeenClosed(): return default: key := keys[r.Int31n(int32(len(keys)))] atomic.AddUint64(&sizeRead, lookupForKey(db, key)) atomic.AddUint64(&entriesRead, 1) } } } func lookupForKey(db *badger.DB, key []byte) (sz uint64) { err := db.View(func(txn *badger.Txn) error { itm, err := txn.Get(key) y.Check(err) if keysOnly { sz = uint64(itm.KeySize()) } else { y.Check2(itm.ValueCopy(nil)) sz = uint64(itm.EstimatedSize()) } return nil }) y.Check(err) return } // getSampleKeys uses the stream framework internally to collect keys, and shuffles them // before returning so that callers see them in random order. func getSampleKeys(db *badger.DB) ([][]byte, error) { var keys [][]byte count := 0 stream := db.NewStream() // Override stream.KeyToList as we only want keys. Also, // we can take just the first version of the key. stream.KeyToList = func(key []byte, itr *badger.Iterator) (*pb.KVList, error) { l := &pb.KVList{} // Since stream framework copies the item's key while calling // KeyToList, we can directly append key to list. l.Kv = append(l.Kv, &pb.KV{Key: key}) return l, nil } ctx, cancel := context.WithCancel(context.Background()) defer cancel() stream.Send = func(l *pb.KVList) error { if count >= sampleSize { return nil } for _, kv := range l.Kv { keys = append(keys, kv.Key) count++ if count >= sampleSize { cancel() return nil } } return nil } if err := stream.Orchestrate(ctx); err != nil && err != context.Canceled { return nil, err } // Shuffle keys before returning to minimise locality // of keys coming from stream framework. 
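// The stream framework hands keys back range-partitioned and roughly ordered, so // without a shuffle the benchmark's "random" lookups would cluster in whichever key // ranges were sampled first. rand.Shuffle performs an in-place Fisher-Yates shuffle // using the global source seeded in readBench above. 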
rand.Shuffle(len(keys), func(i, j int) { keys[i], keys[j] = keys[j], keys[i] }) return keys, nil } func getLoadingMode(m string) options.FileLoadingMode { m = strings.ToLower(m) var mode options.FileLoadingMode switch m { case "fileio": mode = options.FileIO case "mmap": mode = options.MemoryMap default: panic("loading mode not supported") } return mode } badger-2.2007.2/badger/cmd/restore.go000066400000000000000000000044721372173116500172060ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "errors" "os" "path" "github.com/dgraph-io/badger/v2" "github.com/spf13/cobra" ) var restoreFile string var maxPendingWrites int // restoreCmd represents the restore command var restoreCmd = &cobra.Command{ Use: "restore", Short: "Restore Badger database.", Long: `Restore Badger database from a file. It reads a file generated using the backup command (or by calling the DB.Backup() API method) and writes each key-value pair found in the file to the Badger database. Restore creates a new database, and currently does not work on an already existing database.`, RunE: doRestore, } func init() { RootCmd.AddCommand(restoreCmd) restoreCmd.Flags().StringVarP(&restoreFile, "backup-file", "f", "badger.bak", "File to restore from") // Default value for maxPendingWrites is 256, to minimise memory usage // and overall finish time. restoreCmd.Flags().IntVarP(&maxPendingWrites, "max-pending-writes", "w", 256, "Max number of pending writes at any time during restore") } func doRestore(cmd *cobra.Command, args []string) error { // Check if the DB already exists manifestFile := path.Join(sstDir, badger.ManifestFilename) if _, err := os.Stat(manifestFile); err == nil { // No error. File already exists. return errors.New("Cannot restore to an already existing database") } else if os.IsNotExist(err) { // pass } else { // Return any error other than the one handled above. return err } // Open DB db, err := badger.Open(badger.DefaultOptions(sstDir).WithValueDir(vlogDir)) if err != nil { return err } defer db.Close() // Open File f, err := os.Open(restoreFile) if err != nil { return err } defer f.Close() // Run restore return db.Load(f, maxPendingWrites) } badger-2.2007.2/badger/cmd/root.go000066400000000000000000000034021372173116500164760ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package cmd import ( "errors" "fmt" "os" "strings" "github.com/spf13/cobra" ) var sstDir, vlogDir string // RootCmd represents the base command when called without any subcommands var RootCmd = &cobra.Command{ Use: "badger", Short: "Tools to manage Badger database.", PersistentPreRunE: validateRootCmdArgs, } // Execute adds all child commands to the root command and sets flags appropriately. // This is called by main.main(). It only needs to happen once to the rootCmd. func Execute() { if err := RootCmd.Execute(); err != nil { fmt.Println(err) os.Exit(1) } } func init() { RootCmd.PersistentFlags().StringVar(&sstDir, "dir", "", "Directory where the LSM tree files are located. (required)") RootCmd.PersistentFlags().StringVar(&vlogDir, "vlog-dir", "", "Directory where the value log files are located, if different from --dir") } func validateRootCmdArgs(cmd *cobra.Command, args []string) error { if strings.HasPrefix(cmd.Use, "help ") { // No need to validate if it is help return nil } if sstDir == "" { return errors.New("--dir not specified") } if vlogDir == "" { vlogDir = sstDir } return nil } badger-2.2007.2/badger/cmd/rotate.go000066400000000000000000000037401372173116500170160ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "io/ioutil" "os" "time" "github.com/dgraph-io/badger/v2" "github.com/spf13/cobra" ) var oldKeyPath string var newKeyPath string var rotateCmd = &cobra.Command{ Use: "rotate", Short: "Rotate encryption key.", Long: "Rotate will replace the old encryption key with a new encryption key.", RunE: doRotate, } func init() { RootCmd.AddCommand(rotateCmd) rotateCmd.Flags().StringVarP(&oldKeyPath, "old-key-path", "o", "", "Path of the old key") rotateCmd.Flags().StringVarP(&newKeyPath, "new-key-path", "n", "", "Path of the new key") } func doRotate(cmd *cobra.Command, args []string) error { oldKey, err := getKey(oldKeyPath) if err != nil { return err } opt := badger.KeyRegistryOptions{ Dir: sstDir, ReadOnly: true, EncryptionKey: oldKey, EncryptionKeyRotationDuration: 10 * 24 * time.Hour, } kr, err := badger.OpenKeyRegistry(opt) if err != nil { return err } newKey, err := getKey(newKeyPath) if err != nil { return err } opt.EncryptionKey = newKey err = badger.WriteKeyRegistry(kr, opt) if err != nil { return err } return nil } func getKey(path string) ([]byte, error) { if path == "" { // An empty path means an empty key, which is used when converting between // plain text and encryption (in either direction). return []byte{}, nil } fp, err := os.Open(path) if err != nil { return nil, err } return ioutil.ReadAll(fp) } badger-2.2007.2/badger/cmd/rotate_test.go000066400000000000000000000075271372173116500200640ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "io/ioutil" "math/rand" "os" "testing" "github.com/dgraph-io/badger/v2" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) func TestRotate(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer os.RemoveAll(dir) // Creating sample key. key := make([]byte, 32) _, err = rand.Read(key) require.NoError(t, err) fp, err := ioutil.TempFile("", "*.key") require.NoError(t, err) _, err = fp.Write(key) require.NoError(t, err) defer fp.Close() // Opening DB with the encryption key. opts := badger.DefaultOptions(dir) opts.EncryptionKey = key db, err := badger.Open(opts) require.NoError(t, err) // Closing the db. require.NoError(t, db.Close()) // Opening the db again to make sure it opens successfully. db, err = badger.Open(opts) require.NoError(t, err) // Closing so that we can open another db require.NoError(t, db.Close()) // Creating another sample key. key2 := make([]byte, 32) _, err = rand.Read(key2) require.NoError(t, err) fp2, err := ioutil.TempFile("", "*.key") require.NoError(t, err) _, err = fp2.Write(key2) require.NoError(t, err) defer fp2.Close() oldKeyPath = fp2.Name() sstDir = dir // Check whether we are able to rotate the key using an unrelated sample key. We should get a // mismatch error. require.EqualError(t, doRotate(nil, []string{}), badger.ErrEncryptionKeyMismatch.Error()) // Rotating the key with the proper old key. oldKeyPath = fp.Name() newKeyPath = fp2.Name() require.NoError(t, doRotate(nil, []string{})) // Checking whether db opens with the new key. opts.EncryptionKey = key2 db, err = badger.Open(opts) require.NoError(t, err) require.NoError(t, db.Close()) // Checking for plain text rotation. oldKeyPath = newKeyPath newKeyPath = "" require.NoError(t, doRotate(nil, []string{})) opts.EncryptionKey = []byte{} db, err = badger.Open(opts) require.NoError(t, err) defer db.Close() } // This test shows that the rotate tool can be used to enable encryption. func TestRotatePlainTextToEncrypted(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer os.RemoveAll(dir) // Open DB without encryption. opts := badger.DefaultOptions(dir) db, err := badger.Open(opts) require.NoError(t, err) db.Update(func(txn *badger.Txn) error { return txn.Set([]byte("foo"), []byte("bar")) }) require.NoError(t, db.Close()) // Create an encryption key. key := make([]byte, 32) y.Check2(rand.Read(key)) fp, err := ioutil.TempFile("", "*.key") require.NoError(t, err) _, err = fp.Write(key) require.NoError(t, err) defer fp.Close() oldKeyPath = "" newKeyPath = fp.Name() sstDir = dir // Enable encryption. The DB will now be encrypted with the key at newKeyPath. require.Nil(t, doRotate(nil, []string{})) // Try opening DB without the key. _, err = badger.Open(opts) require.EqualError(t, err, badger.ErrEncryptionKeyMismatch.Error()) // Check whether db opens with the new key. 
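// The iteration below should still find the single ("foo", "bar") entry that was // written before encryption was enabled, showing the rotation preserved existing data. 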
opts.EncryptionKey = key db, err = badger.Open(opts) require.NoError(t, err) db.View(func(txn *badger.Txn) error { iopt := badger.DefaultIteratorOptions it := txn.NewIterator(iopt) defer it.Close() count := 0 for it.Rewind(); it.Valid(); it.Next() { count++ } require.Equal(t, 1, count) return nil }) require.NoError(t, db.Close()) } badger-2.2007.2/badger/cmd/write_bench.go000066400000000000000000000123041372173116500200050ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cmd import ( "encoding/binary" "fmt" "log" "math/rand" "sync" "sync/atomic" "time" humanize "github.com/dustin/go-humanize" "github.com/spf13/cobra" "github.com/dgraph-io/badger/v2" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" ) var writeBenchCmd = &cobra.Command{ Use: "write", Short: "Writes random data to Badger to benchmark write speed.", Long: ` This command writes random data to Badger to benchmark write speed. Useful for testing and performance analysis. `, RunE: writeBench, } var ( keySz int valSz int numKeys float64 force bool sorted bool showLogs bool sizeWritten uint64 entriesWritten uint64 ) const ( mil float64 = 1e6 ) func init() { benchCmd.AddCommand(writeBenchCmd) writeBenchCmd.Flags().IntVarP(&keySz, "key-size", "k", 32, "Size of key") writeBenchCmd.Flags().IntVarP(&valSz, "val-size", "v", 128, "Size of value") writeBenchCmd.Flags().Float64VarP(&numKeys, "keys-mil", "m", 10.0, "Number of keys to add in millions") writeBenchCmd.Flags().BoolVarP(&force, "force-compact", "f", true, "Force compact level 0 on close.") writeBenchCmd.Flags().BoolVarP(&sorted, "sorted", "s", false, "Write keys in sorted order.") writeBenchCmd.Flags().BoolVarP(&showLogs, "logs", "l", false, "Show Badger logs.") } func writeRandom(db *badger.DB, num uint64) error { value := make([]byte, valSz) y.Check2(rand.Read(value)) es := uint64(keySz + valSz) // entry size is keySz + valSz batch := db.NewWriteBatch() for i := uint64(1); i <= num; i++ { key := make([]byte, keySz) y.Check2(rand.Read(key)) if err := batch.Set(key, value); err != nil { return err } atomic.AddUint64(&entriesWritten, 1) atomic.AddUint64(&sizeWritten, es) } return batch.Flush() } func writeSorted(db *badger.DB, num uint64) error { value := make([]byte, valSz) y.Check2(rand.Read(value)) es := 8 + valSz // key size is 8 bytes and value size is valSz writer := db.NewStreamWriter() if err := writer.Prepare(); err != nil { return err } wg := &sync.WaitGroup{} writeCh := make(chan *pb.KVList, 3) writeRange := func(start, end uint64, streamId uint32) { // end is not included. 
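// For example, with num = 1,000,000 and width = num/16 = 62,500, the caller below // invokes writeRange over [0, 62500), [62500, 125000), and so on. When num is not // divisible by 16 the integer division truncates, so a final, shorter 17th range // picks up the remainder (e.g. num = 100 gives width = 6 and a last range of [96, 100)). 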
defer wg.Done() kvs := &pb.KVList{} var sz int for i := start; i < end; i++ { key := make([]byte, 8) binary.BigEndian.PutUint64(key, i) kvs.Kv = append(kvs.Kv, &pb.KV{ Key: key, Value: value, Version: 1, StreamId: streamId, }) sz += es atomic.AddUint64(&entriesWritten, 1) atomic.AddUint64(&sizeWritten, uint64(es)) if sz >= 4<<20 { // 4 MB writeCh <- kvs kvs = &pb.KVList{} sz = 0 } } writeCh <- kvs } // Let's create some streams. width := num / 16 streamID := uint32(0) for start := uint64(0); start < num; start += width { end := start + width if end > num { end = num } streamID++ wg.Add(1) go writeRange(start, end, streamID) } go func() { wg.Wait() close(writeCh) }() log.Printf("Max StreamId used: %d. Width: %d\n", streamID, width) for kvs := range writeCh { if err := writer.Write(kvs); err != nil { panic(err) } } log.Println("DONE streaming. Flushing...") return writer.Flush() } func writeBench(cmd *cobra.Command, args []string) error { opt := badger.DefaultOptions(sstDir). WithValueDir(vlogDir). WithTruncate(truncate). WithSyncWrites(false). WithCompactL0OnClose(force) if !showLogs { opt = opt.WithLogger(nil) } db, err := badger.Open(opt) if err != nil { return err } defer func() { start := time.Now() err := db.Close() log.Printf("DB.Close. Error: %v. Time taken to close: %s", err, time.Since(start)) }() fmt.Println("*********************************************************") fmt.Println("Starting to benchmark Writes") fmt.Println("*********************************************************") startTime = time.Now() num := uint64(numKeys * mil) c := y.NewCloser(1) go reportStats(c) if sorted { err = writeSorted(db, num) } else { err = writeRandom(db, num) } c.SignalAndWait() return err } func reportStats(c *y.Closer) { defer c.Done() t := time.NewTicker(time.Second) defer t.Stop() for { select { case <-c.HasBeenClosed(): return case <-t.C: dur := time.Since(startTime) sz := atomic.LoadUint64(&sizeWritten) entries := atomic.LoadUint64(&entriesWritten) bytesRate := sz / uint64(dur.Seconds()) entriesRate := entries / uint64(dur.Seconds()) fmt.Printf("Time elapsed: %s, bytes written: %s, speed: %s/sec, "+ "entries written: %d, speed: %d/sec\n", y.FixedDuration(time.Since(startTime)), humanize.Bytes(sz), humanize.Bytes(bytesRate), entries, entriesRate) } } } badger-2.2007.2/badger/main.go000066400000000000000000000021241372173116500156740ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package main import ( "fmt" "net/http" _ "net/http/pprof" "runtime" "github.com/dgraph-io/badger/v2/badger/cmd" ) func main() { go func() { for i := 8080; i < 9080; i++ { fmt.Printf("Listening for /debug HTTP requests at port: %d\n", i) if err := http.ListenAndServe(fmt.Sprintf("0.0.0.0:%d", i), nil); err != nil { fmt.Println("Port busy. 
Trying another one...") continue } } }() runtime.SetBlockProfileRate(100) runtime.GOMAXPROCS(128) cmd.Execute() } badger-2.2007.2/batch.go000066400000000000000000000133161372173116500146120ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "sync" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" ) // WriteBatch holds the necessary info to perform batched writes. type WriteBatch struct { sync.Mutex txn *Txn db *DB throttle *y.Throttle err error isManaged bool commitTs uint64 } // NewWriteBatch creates a new WriteBatch. This provides a way to conveniently do a lot of writes, // batching them up as tightly as possible in a single transaction and using callbacks to avoid // waiting for them to commit, thus achieving good performance. This API hides away the logic of // creating and committing transactions. Due to the nature of SSI guaratees provided by Badger, // blind writes can never encounter transaction conflicts (ErrConflict). func (db *DB) NewWriteBatch() *WriteBatch { if db.opt.managedTxns { panic("cannot use NewWriteBatch in managed mode. Use NewWriteBatchAt instead") } return db.newWriteBatch(false) } func (db *DB) newWriteBatch(isManaged bool) *WriteBatch { return &WriteBatch{ db: db, isManaged: isManaged, txn: db.newTransaction(true, isManaged), throttle: y.NewThrottle(16), } } // SetMaxPendingTxns sets a limit on maximum number of pending transactions while writing batches. // This function should be called before using WriteBatch. Default value of MaxPendingTxns is // 16 to minimise memory usage. func (wb *WriteBatch) SetMaxPendingTxns(max int) { wb.throttle = y.NewThrottle(max) } // Cancel function must be called if there's a chance that Flush might not get // called. If neither Flush or Cancel is called, the transaction oracle would // never get a chance to clear out the row commit timestamp map, thus causing an // unbounded memory consumption. Typically, you can call Cancel as a defer // statement right after NewWriteBatch is called. // // Note that any committed writes would still go through despite calling Cancel. func (wb *WriteBatch) Cancel() { if err := wb.throttle.Finish(); err != nil { wb.db.opt.Errorf("WatchBatch.Cancel error while finishing: %v", err) } wb.txn.Discard() } func (wb *WriteBatch) callback(err error) { // sync.WaitGroup is thread-safe, so it doesn't need to be run inside wb.Lock. 
defer wb.throttle.Done(err) if err == nil { return } wb.Lock() defer wb.Unlock() if wb.err != nil { return } wb.err = err } func (wb *WriteBatch) Write(kvList *pb.KVList) error { wb.Lock() defer wb.Unlock() for _, kv := range kvList.Kv { e := Entry{Key: kv.Key, Value: kv.Value} if len(kv.UserMeta) > 0 { e.UserMeta = kv.UserMeta[0] } y.AssertTrue(kv.Version != 0) e.version = kv.Version if err := wb.handleEntry(&e); err != nil { return err } } return nil } // SetEntryAt is the equivalent of Txn.SetEntry but it also allows setting the version for the // entry. SetEntryAt can be used only in managed mode. func (wb *WriteBatch) SetEntryAt(e *Entry, ts uint64) error { if !wb.db.opt.managedTxns { return errors.New("SetEntryAt can only be used in managed mode. Use SetEntry instead") } e.version = ts return wb.SetEntry(e) } // Should be called with lock acquired. func (wb *WriteBatch) handleEntry(e *Entry) error { if err := wb.txn.SetEntry(e); err != ErrTxnTooBig { return err } // Txn has reached its zenith. Commit now. if cerr := wb.commit(); cerr != nil { return cerr } // This time the error must not be ErrTxnTooBig, otherwise we make the // error permanent. if err := wb.txn.SetEntry(e); err != nil { wb.err = err return err } return nil } // SetEntry is the equivalent of Txn.SetEntry. func (wb *WriteBatch) SetEntry(e *Entry) error { wb.Lock() defer wb.Unlock() return wb.handleEntry(e) } // Set is the equivalent of Txn.Set(). func (wb *WriteBatch) Set(k, v []byte) error { e := &Entry{Key: k, Value: v} return wb.SetEntry(e) } // DeleteAt is the equivalent of Txn.Delete but accepts a delete timestamp. func (wb *WriteBatch) DeleteAt(k []byte, ts uint64) error { e := Entry{Key: k, meta: bitDelete, version: ts} return wb.SetEntry(&e) } // Delete is the equivalent of Txn.Delete. func (wb *WriteBatch) Delete(k []byte) error { wb.Lock() defer wb.Unlock() if err := wb.txn.Delete(k); err != ErrTxnTooBig { return err } if err := wb.commit(); err != nil { return err } if err := wb.txn.Delete(k); err != nil { wb.err = err return err } return nil } // Caller to commit must hold a write lock. func (wb *WriteBatch) commit() error { if wb.err != nil { return wb.err } if err := wb.throttle.Do(); err != nil { return err } wb.txn.CommitWith(wb.callback) wb.txn = wb.db.newTransaction(true, wb.isManaged) wb.txn.commitTs = wb.commitTs return wb.err } // Flush must be called at the end to ensure that any pending writes get committed to Badger. Flush // returns any error stored by WriteBatch. func (wb *WriteBatch) Flush() error { wb.Lock() _ = wb.commit() wb.txn.Discard() wb.Unlock() if err := wb.throttle.Finish(); err != nil { return err } return wb.err } // Error returns any errors encountered so far. No commits would be run once an error is detected. func (wb *WriteBatch) Error() error { wb.Lock() defer wb.Unlock() return wb.err } badger-2.2007.2/batch_test.go000066400000000000000000000065411372173116500156530ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package badger import ( "fmt" "testing" "time" "github.com/stretchr/testify/require" ) func TestWriteBatch(t *testing.T) { key := func(i int) []byte { return []byte(fmt.Sprintf("%10d", i)) } val := func(i int) []byte { return []byte(fmt.Sprintf("%128d", i)) } test := func(t *testing.T, db *DB) { wb := db.NewWriteBatch() defer wb.Cancel() // Sanity check for SetEntryAt. require.Error(t, wb.SetEntryAt(&Entry{}, 12)) N, M := 50000, 1000 start := time.Now() for i := 0; i < N; i++ { require.NoError(t, wb.Set(key(i), val(i))) } for i := 0; i < M; i++ { require.NoError(t, wb.Delete(key(i))) } require.NoError(t, wb.Flush()) t.Logf("Time taken for %d writes (w/ test options): %s\n", N+M, time.Since(start)) err := db.View(func(txn *Txn) error { itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() i := M for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() require.Equal(t, string(key(i)), string(item.Key())) valcopy, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val(i), valcopy) i++ } require.Equal(t, N, i) return nil }) require.NoError(t, err) } t.Run("disk mode", func(t *testing.T) { opt := getTestOptions("") // Set value threshold to 32 bytes otherwise write batch will generate // too many files and we will crash with too many files open error. opt.ValueThreshold = 32 runBadgerTest(t, &opt, func(t *testing.T, db *DB) { test(t, db) }) }) t.Run("InMemory mode", func(t *testing.T) { opt := getTestOptions("") opt.InMemory = true db, err := Open(opt) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) } // This test ensures we don't end up in deadlock in case of empty writebatch. func TestEmptyWriteBatch(t *testing.T) { t.Run("normal mode", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { wb := db.NewWriteBatch() require.NoError(t, wb.Flush()) wb = db.NewWriteBatch() require.NoError(t, wb.Flush()) wb = db.NewWriteBatch() require.NoError(t, wb.Flush()) }) }) t.Run("managed mode", func(t *testing.T) { opt := getTestOptions("") opt.managedTxns = true runBadgerTest(t, &opt, func(t *testing.T, db *DB) { t.Run("WriteBatchAt", func(t *testing.T) { wb := db.NewWriteBatchAt(2) require.NoError(t, wb.Flush()) wb = db.NewWriteBatchAt(208) require.NoError(t, wb.Flush()) wb = db.NewWriteBatchAt(31) require.NoError(t, wb.Flush()) }) t.Run("ManagedWriteBatch", func(t *testing.T) { wb := db.NewManagedWriteBatch() require.NoError(t, wb.Flush()) wb = db.NewManagedWriteBatch() require.NoError(t, wb.Flush()) wb = db.NewManagedWriteBatch() require.NoError(t, wb.Flush()) }) }) }) } badger-2.2007.2/compaction.go000066400000000000000000000124241372173116500156640ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package badger import ( "bytes" "fmt" "log" "math" "sync" "golang.org/x/net/trace" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" ) type keyRange struct { left []byte right []byte inf bool } var infRange = keyRange{inf: true} func (r keyRange) String() string { return fmt.Sprintf("[left=%x, right=%x, inf=%v]", r.left, r.right, r.inf) } func (r keyRange) equals(dst keyRange) bool { return bytes.Equal(r.left, dst.left) && bytes.Equal(r.right, dst.right) && r.inf == dst.inf } func (r keyRange) overlapsWith(dst keyRange) bool { if r.inf || dst.inf { return true } // If my left is greater than dst right, we have no overlap. if y.CompareKeys(r.left, dst.right) > 0 { return false } // If my right is less than dst left, we have no overlap. if y.CompareKeys(r.right, dst.left) < 0 { return false } // We have overlap. return true } func getKeyRange(tables ...*table.Table) keyRange { if len(tables) == 0 { return keyRange{} } smallest := tables[0].Smallest() biggest := tables[0].Biggest() for i := 1; i < len(tables); i++ { if y.CompareKeys(tables[i].Smallest(), smallest) < 0 { smallest = tables[i].Smallest() } if y.CompareKeys(tables[i].Biggest(), biggest) > 0 { biggest = tables[i].Biggest() } } // We pick all the versions of the smallest and the biggest key. Note that version zero would // be the rightmost key, considering versions are default sorted in descending order. return keyRange{ left: y.KeyWithTs(y.ParseKey(smallest), math.MaxUint64), right: y.KeyWithTs(y.ParseKey(biggest), 0), } } type levelCompactStatus struct { ranges []keyRange delSize int64 } func (lcs *levelCompactStatus) debug() string { var b bytes.Buffer for _, r := range lcs.ranges { b.WriteString(r.String()) } return b.String() } func (lcs *levelCompactStatus) overlapsWith(dst keyRange) bool { for _, r := range lcs.ranges { if r.overlapsWith(dst) { return true } } return false } func (lcs *levelCompactStatus) remove(dst keyRange) bool { final := lcs.ranges[:0] var found bool for _, r := range lcs.ranges { if !r.equals(dst) { final = append(final, r) } else { found = true } } lcs.ranges = final return found } type compactStatus struct { sync.RWMutex levels []*levelCompactStatus } func (cs *compactStatus) toLog(tr trace.Trace) { cs.RLock() defer cs.RUnlock() tr.LazyPrintf("Compaction status:") for i, l := range cs.levels { if l.debug() == "" { continue } tr.LazyPrintf("[%d] %s", i, l.debug()) } } func (cs *compactStatus) overlapsWith(level int, this keyRange) bool { cs.RLock() defer cs.RUnlock() thisLevel := cs.levels[level] return thisLevel.overlapsWith(this) } func (cs *compactStatus) delSize(l int) int64 { cs.RLock() defer cs.RUnlock() return cs.levels[l].delSize } type thisAndNextLevelRLocked struct{} // compareAndAdd will check whether we can run this compactDef. That it doesn't overlap with any // other running compaction. If it can be run, it would store this run in the compactStatus state. func (cs *compactStatus) compareAndAdd(_ thisAndNextLevelRLocked, cd compactDef) bool { cs.Lock() defer cs.Unlock() level := cd.thisLevel.level y.AssertTruef(level < len(cs.levels)-1, "Got level %d. Max levels: %d", level, len(cs.levels)) thisLevel := cs.levels[level] nextLevel := cs.levels[level+1] if thisLevel.overlapsWith(cd.thisRange) { return false } if nextLevel.overlapsWith(cd.nextRange) { return false } // Check whether this level really needs compaction or not. Otherwise, we'll end up // running parallel compactions for the same level. // Update: We should not be checking size here. 
Compaction priority already did the size checks. // Here we should just be executing the wish of others. thisLevel.ranges = append(thisLevel.ranges, cd.thisRange) nextLevel.ranges = append(nextLevel.ranges, cd.nextRange) thisLevel.delSize += cd.thisSize return true } func (cs *compactStatus) delete(cd compactDef) { cs.Lock() defer cs.Unlock() level := cd.thisLevel.level y.AssertTruef(level < len(cs.levels)-1, "Got level %d. Max levels: %d", level, len(cs.levels)) thisLevel := cs.levels[level] nextLevel := cs.levels[level+1] thisLevel.delSize -= cd.thisSize found := thisLevel.remove(cd.thisRange) found = nextLevel.remove(cd.nextRange) && found if !found { this := cd.thisRange next := cd.nextRange fmt.Printf("Looking for: [%q, %q, %v] in this level.\n", this.left, this.right, this.inf) fmt.Printf("This Level:\n%s\n", thisLevel.debug()) fmt.Println() fmt.Printf("Looking for: [%q, %q, %v] in next level.\n", next.left, next.right, next.inf) fmt.Printf("Next Level:\n%s\n", nextLevel.debug()) log.Fatal("keyRange not found") } } badger-2.2007.2/db.go000066400000000000000000001442511372173116500141210ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "context" "encoding/binary" "expvar" "math" "os" "path/filepath" "sort" "strconv" "sync" "sync/atomic" "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/skl" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/dgraph-io/ristretto" humanize "github.com/dustin/go-humanize" "github.com/pkg/errors" ) var ( badgerPrefix = []byte("!badger!") // Prefix for internal keys used by badger. head = []byte("!badger!head") // For storing value offset for replay. txnKey = []byte("!badger!txn") // For indicating end of entries in txn. badgerMove = []byte("!badger!move") // For key-value pairs which got moved during GC. lfDiscardStatsKey = []byte("!badger!discard") // For storing lfDiscardStats ) type closers struct { updateSize *y.Closer compactors *y.Closer memtable *y.Closer writes *y.Closer valueGC *y.Closer pub *y.Closer } // DB provides the various functions required to interact with Badger. // DB is thread-safe. type DB struct { sync.RWMutex // Guards list of inmemory tables, not individual reads and writes. dirLockGuard *directoryLockGuard // nil if Dir and ValueDir are the same valueDirGuard *directoryLockGuard closers closers mt *skl.Skiplist // Our latest (actively written) in-memory table imm []*skl.Skiplist // Add here only AFTER pushing to flushChan. opt Options manifest *manifestFile lc *levelsController vlog valueLog vhead valuePointer // less than or equal to a pointer to the last vlog value put into mt writeCh chan *request flushChan chan flushTask // For flushing memtables. closeOnce sync.Once // For closing DB only once. // Number of log rotates since the last memtable flush. We will access this field via atomic // functions. 
Since we are not going to use any 64bit atomic functions, there is no need for // 64 bit alignment of this struct (see #311). logRotates int32 blockWrites int32 isClosed uint32 orc *oracle pub *publisher registry *KeyRegistry blockCache *ristretto.Cache indexCache *ristretto.Cache } const ( kvWriteChCapacity = 1000 ) func (db *DB) replayFunction() func(Entry, valuePointer) error { type txnEntry struct { nk []byte v y.ValueStruct } var txn []txnEntry var lastCommit uint64 toLSM := func(nk []byte, vs y.ValueStruct) { for err := db.ensureRoomForWrite(); err != nil; err = db.ensureRoomForWrite() { db.opt.Debugf("Replay: Making room for writes") time.Sleep(10 * time.Millisecond) } db.mt.Put(nk, vs) } first := true return func(e Entry, vp valuePointer) error { // Function for replaying. if first { db.opt.Debugf("First key=%q\n", e.Key) } first = false db.orc.Lock() if db.orc.nextTxnTs < y.ParseTs(e.Key) { db.orc.nextTxnTs = y.ParseTs(e.Key) } db.orc.Unlock() nk := make([]byte, len(e.Key)) copy(nk, e.Key) var nv []byte meta := e.meta if db.shouldWriteValueToLSM(e) { nv = make([]byte, len(e.Value)) copy(nv, e.Value) } else { nv = vp.Encode() meta = meta | bitValuePointer } // Update vhead. If the crash happens while replay was in progress // and the head is not updated, we will end up replaying all the // files starting from file zero, again. db.updateHead([]valuePointer{vp}) v := y.ValueStruct{ Value: nv, Meta: meta, UserMeta: e.UserMeta, ExpiresAt: e.ExpiresAt, } switch { case e.meta&bitFinTxn > 0: txnTs, err := strconv.ParseUint(string(e.Value), 10, 64) if err != nil { return errors.Wrapf(err, "Unable to parse txn fin: %q", e.Value) } y.AssertTrue(lastCommit == txnTs) y.AssertTrue(len(txn) > 0) // Got the end of txn. Now we can store them. for _, t := range txn { toLSM(t.nk, t.v) } txn = txn[:0] lastCommit = 0 case e.meta&bitTxn > 0: txnTs := y.ParseTs(nk) if lastCommit == 0 { lastCommit = txnTs } if lastCommit != txnTs { db.opt.Warningf("Found an incomplete txn at timestamp %d. Discarding it.\n", lastCommit) txn = txn[:0] lastCommit = txnTs } te := txnEntry{nk: nk, v: v} txn = append(txn, te) default: // This entry is from a rewrite or via SetEntryAt(..). toLSM(nk, v) // We shouldn't get this entry in the middle of a transaction. y.AssertTrue(lastCommit == 0) y.AssertTrue(len(txn) == 0) } return nil } } // Open returns a new DB object. func Open(opt Options) (db *DB, err error) { // It's okay to have zero compactors which will disable all compactions but // we cannot have just one compactor otherwise we will end up with all data on level 2. if opt.NumCompactors == 1 { return nil, errors.New("Cannot have 1 compactor. Need at least 2") } if opt.InMemory && (opt.Dir != "" || opt.ValueDir != "") { return nil, errors.New("Cannot use badger in Disk-less mode with Dir or ValueDir set") } opt.maxBatchSize = (15 * opt.MaxTableSize) / 100 opt.maxBatchCount = opt.maxBatchSize / int64(skl.MaxNodeSize) // We are limiting opt.ValueThreshold to maxValueThreshold for now. if opt.ValueThreshold > maxValueThreshold { return nil, errors.Errorf("Invalid ValueThreshold, must be less than or equal to %d", maxValueThreshold) } // If ValueThreshold is greater than opt.maxBatchSize, we won't be able to push any data using // the transaction APIs. Transactions batch entries into batches of size opt.maxBatchSize. if int64(opt.ValueThreshold) > opt.maxBatchSize { return nil, errors.Errorf("ValueThreshold greater than max batch size of %d. 
Either "+ "reduce opt.ValueThreshold or increase opt.MaxTableSize.", opt.maxBatchSize) } if !(opt.ValueLogFileSize <= 2<<30 && opt.ValueLogFileSize >= 1<<20) { return nil, ErrValueLogSize } if !(opt.ValueLogLoadingMode == options.FileIO || opt.ValueLogLoadingMode == options.MemoryMap) { return nil, ErrInvalidLoadingMode } // Return error if badger is built without cgo and compression is set to ZSTD. if opt.Compression == options.ZSTD && !y.CgoEnabled { return nil, y.ErrZstdCgo } // Keep L0 in memory if either KeepL0InMemory is set or if InMemory is set. opt.KeepL0InMemory = opt.KeepL0InMemory || opt.InMemory // Compact L0 on close if either it is set or if KeepL0InMemory is set. When // keepL0InMemory is set we need to compact L0 on close otherwise we might lose data. opt.CompactL0OnClose = opt.CompactL0OnClose || opt.KeepL0InMemory if opt.ReadOnly { // Can't truncate if the DB is read only. opt.Truncate = false // Do not perform compaction in read only mode. opt.CompactL0OnClose = false } var dirLockGuard, valueDirLockGuard *directoryLockGuard // Create directories and acquire lock on it only if badger is not running in InMemory mode. // We don't have any directories/files in InMemory mode so we don't need to acquire // any locks on them. if !opt.InMemory { if err := createDirs(opt); err != nil { return nil, err } if !opt.BypassLockGuard { dirLockGuard, err = acquireDirectoryLock(opt.Dir, lockFile, opt.ReadOnly) if err != nil { return nil, err } defer func() { if dirLockGuard != nil { _ = dirLockGuard.release() } }() absDir, err := filepath.Abs(opt.Dir) if err != nil { return nil, err } absValueDir, err := filepath.Abs(opt.ValueDir) if err != nil { return nil, err } if absValueDir != absDir { valueDirLockGuard, err = acquireDirectoryLock(opt.ValueDir, lockFile, opt.ReadOnly) if err != nil { return nil, err } defer func() { if valueDirLockGuard != nil { _ = valueDirLockGuard.release() } }() } } } manifestFile, manifest, err := openOrCreateManifestFile(opt) if err != nil { return nil, err } defer func() { if manifestFile != nil { _ = manifestFile.close() } }() db = &DB{ imm: make([]*skl.Skiplist, 0, opt.NumMemtables), flushChan: make(chan flushTask, opt.NumMemtables), writeCh: make(chan *request, kvWriteChCapacity), opt: opt, manifest: manifestFile, dirLockGuard: dirLockGuard, valueDirGuard: valueDirLockGuard, orc: newOracle(opt), pub: newPublisher(), } // Cleanup all the goroutines started by badger in case of an error. defer func() { if err != nil { db.cleanup() db = nil } }() if opt.BlockCacheSize > 0 { config := ristretto.Config{ // Use 5% of cache memory for storing counters. NumCounters: int64(float64(opt.BlockCacheSize) * 0.05 * 2), MaxCost: int64(float64(opt.BlockCacheSize) * 0.95), BufferItems: 64, Metrics: true, } db.blockCache, err = ristretto.NewCache(&config) if err != nil { return nil, errors.Wrap(err, "failed to create data cache") } } if opt.IndexCacheSize > 0 { config := ristretto.Config{ // Use 5% of cache memory for storing counters. NumCounters: int64(float64(opt.IndexCacheSize) * 0.05 * 2), MaxCost: int64(float64(opt.IndexCacheSize) * 0.95), BufferItems: 64, Metrics: true, } db.indexCache, err = ristretto.NewCache(&config) if err != nil { return nil, errors.Wrap(err, "failed to create bf cache") } } if db.opt.InMemory { db.opt.SyncWrites = false // If badger is running in memory mode, push everything into the LSM Tree. 
db.opt.ValueThreshold = math.MaxInt32 } krOpt := KeyRegistryOptions{ ReadOnly: opt.ReadOnly, Dir: opt.Dir, EncryptionKey: opt.EncryptionKey, EncryptionKeyRotationDuration: opt.EncryptionKeyRotationDuration, InMemory: opt.InMemory, } if db.registry, err = OpenKeyRegistry(krOpt); err != nil { return db, err } db.calculateSize() db.closers.updateSize = y.NewCloser(1) go db.updateSize(db.closers.updateSize) db.mt = skl.NewSkiplist(arenaSize(opt)) // newLevelsController potentially loads files in directory. if db.lc, err = newLevelsController(db, &manifest); err != nil { return db, err } // Initialize vlog struct. db.vlog.init(db) if !opt.ReadOnly { db.closers.compactors = y.NewCloser(1) db.lc.startCompact(db.closers.compactors) db.closers.memtable = y.NewCloser(1) go func() { _ = db.flushMemtable(db.closers.memtable) // Need levels controller to be up. }() } headKey := y.KeyWithTs(head, math.MaxUint64) // Need to pass with timestamp, lsm get removes the last 8 bytes and compares key vs, err := db.get(headKey) if err != nil { return db, errors.Wrap(err, "Retrieving head") } db.orc.nextTxnTs = vs.Version var vptr valuePointer if len(vs.Value) > 0 { vptr.Decode(vs.Value) } replayCloser := y.NewCloser(1) go db.doWrites(replayCloser) if err = db.vlog.open(db, vptr, db.replayFunction()); err != nil { replayCloser.SignalAndWait() return db, y.Wrapf(err, "During db.vlog.open") } replayCloser.SignalAndWait() // Wait for replay to be applied first. // Let's advance nextTxnTs to one more than whatever we observed via // replaying the logs. db.orc.txnMark.Done(db.orc.nextTxnTs) // In normal mode, we must update readMark so older versions of keys can be removed during // compaction when run in offline mode via the flatten tool. db.orc.readMark.Done(db.orc.nextTxnTs) db.orc.incrementNextTs() db.closers.writes = y.NewCloser(1) go db.doWrites(db.closers.writes) if !db.opt.InMemory { db.closers.valueGC = y.NewCloser(1) go db.vlog.waitOnGC(db.closers.valueGC) } db.closers.pub = y.NewCloser(1) go db.pub.listenForUpdates(db.closers.pub) valueDirLockGuard = nil dirLockGuard = nil manifestFile = nil return db, nil } // cleanup stops all the goroutines started by badger. This is used in open to // cleanup goroutines in case of an error. func (db *DB) cleanup() { db.stopMemoryFlush() db.stopCompactions() db.blockCache.Close() db.indexCache.Close() if db.closers.updateSize != nil { db.closers.updateSize.Signal() } if db.closers.valueGC != nil { db.closers.valueGC.Signal() } if db.closers.writes != nil { db.closers.writes.Signal() } if db.closers.pub != nil { db.closers.pub.Signal() } db.orc.Stop() // Do not use vlog.Close() here. vlog.Close truncates the files. We don't // want to truncate files unless the user has specified the truncate flag. db.vlog.stopFlushDiscardStats() } // BlockCacheMetrics returns the metrics for the underlying block cache. func (db *DB) BlockCacheMetrics() *ristretto.Metrics { if db.blockCache != nil { return db.blockCache.Metrics } return nil } // IndexCacheMetrics returns the metrics for the underlying index cache. func (db *DB) IndexCacheMetrics() *ristretto.Metrics { if db.indexCache != nil { return db.indexCache.Metrics } return nil } // Close closes a DB. It's crucial to call it to ensure all the pending updates make their way to // disk. Calling DB.Close() multiple times would still only close the DB once. func (db *DB) Close() error { var err error db.closeOnce.Do(func() { err = db.close() }) return err } // IsClosed denotes if the badger DB is closed or not. 
A DB instance should not // be used after closing it. func (db *DB) IsClosed() bool { return atomic.LoadUint32(&db.isClosed) == 1 } func (db *DB) close() (err error) { db.opt.Debugf("Closing database") atomic.StoreInt32(&db.blockWrites, 1) if !db.opt.InMemory { // Stop value GC first. db.closers.valueGC.SignalAndWait() } // Stop writes next. db.closers.writes.SignalAndWait() // Don't accept any more write. close(db.writeCh) db.closers.pub.SignalAndWait() // Now close the value log. if vlogErr := db.vlog.Close(); vlogErr != nil { err = errors.Wrap(vlogErr, "DB.Close") } // Make sure that block writer is done pushing stuff into memtable! // Otherwise, you will have a race condition: we are trying to flush memtables // and remove them completely, while the block / memtable writer is still // trying to push stuff into the memtable. This will also resolve the value // offset problem: as we push into memtable, we update value offsets there. if !db.mt.Empty() { db.opt.Debugf("Flushing memtable") for { pushedFlushTask := func() bool { db.Lock() defer db.Unlock() y.AssertTrue(db.mt != nil) select { case db.flushChan <- flushTask{mt: db.mt, vptr: db.vhead}: db.imm = append(db.imm, db.mt) // Flusher will attempt to remove this from s.imm. db.mt = nil // Will segfault if we try writing! db.opt.Debugf("pushed to flush chan\n") return true default: // If we fail to push, we need to unlock and wait for a short while. // The flushing operation needs to update s.imm. Otherwise, we have a deadlock. // TODO: Think about how to do this more cleanly, maybe without any locks. } return false }() if pushedFlushTask { break } time.Sleep(10 * time.Millisecond) } } db.stopMemoryFlush() db.stopCompactions() // Force Compact L0 // We don't need to care about cstatus since no parallel compaction is running. if db.opt.CompactL0OnClose { err := db.lc.doCompact(173, compactionPriority{level: 0, score: 1.73}) switch err { case errFillTables: // This error only means that there might be enough tables to do a compaction. So, we // should not report it to the end user to avoid confusing them. case nil: db.opt.Infof("Force compaction on level 0 done") default: db.opt.Warningf("While forcing compaction on level 0: %v", err) } } if lcErr := db.lc.close(); err == nil { err = errors.Wrap(lcErr, "DB.Close") } db.opt.Debugf("Waiting for closer") db.closers.updateSize.SignalAndWait() db.orc.Stop() db.blockCache.Close() db.indexCache.Close() atomic.StoreUint32(&db.isClosed, 1) if db.opt.InMemory { return } if db.dirLockGuard != nil { if guardErr := db.dirLockGuard.release(); err == nil { err = errors.Wrap(guardErr, "DB.Close") } } if db.valueDirGuard != nil { if guardErr := db.valueDirGuard.release(); err == nil { err = errors.Wrap(guardErr, "DB.Close") } } if manifestErr := db.manifest.close(); err == nil { err = errors.Wrap(manifestErr, "DB.Close") } if registryErr := db.registry.Close(); err == nil { err = errors.Wrap(registryErr, "DB.Close") } // Fsync directories to ensure that lock file, and any other removed files whose directory // we haven't specifically fsynced, are guaranteed to have their directory entry removal // persisted to disk. if syncErr := db.syncDir(db.opt.Dir); err == nil { err = errors.Wrap(syncErr, "DB.Close") } if syncErr := db.syncDir(db.opt.ValueDir); err == nil { err = errors.Wrap(syncErr, "DB.Close") } return err } // VerifyChecksum verifies checksum for all tables on all levels. // This method can be used to verify checksum, if opt.ChecksumVerificationMode is NoVerification. 
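// A minimal usage sketch (the path and error handling here are illustrative // placeholders, not prescribed by this API): // db, err := badger.Open(badger.DefaultOptions("/tmp/badger")) // if err != nil { log.Fatal(err) } // defer db.Close() // if err := db.VerifyChecksum(); err != nil { log.Fatalf("checksum mismatch: %v", err) } 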
func (db *DB) VerifyChecksum() error { return db.lc.verifyChecksum() } const ( lockFile = "LOCK" ) // Sync syncs database content to disk. This function provides // more control to user to sync data whenever required. func (db *DB) Sync() error { return db.vlog.sync(math.MaxUint32) } // getMemtables returns the current memtables and get references. func (db *DB) getMemTables() ([]*skl.Skiplist, func()) { db.RLock() defer db.RUnlock() tables := make([]*skl.Skiplist, len(db.imm)+1) // Get mutable memtable. tables[0] = db.mt tables[0].IncrRef() // Get immutable memtables. last := len(db.imm) - 1 for i := range db.imm { tables[i+1] = db.imm[last-i] tables[i+1].IncrRef() } return tables, func() { for _, tbl := range tables { tbl.DecrRef() } } } // get returns the value in memtable or disk for given key. // Note that value will include meta byte. // // IMPORTANT: We should never write an entry with an older timestamp for the same key, We need to // maintain this invariant to search for the latest value of a key, or else we need to search in all // tables and find the max version among them. To maintain this invariant, we also need to ensure // that all versions of a key are always present in the same table from level 1, because compaction // can push any table down. // // Update (Sep 22, 2018): To maintain the above invariant, and to allow keys to be moved from one // value log to another (while reclaiming space during value log GC), we have logically moved this // need to write "old versions after new versions" to the badgerMove keyspace. Thus, for normal // gets, we can stop going down the LSM tree once we find any version of the key (note however that // we will ALWAYS skip versions with ts greater than the key version). However, if that key has // been moved, then for the corresponding movekey, we'll look through all the levels of the tree // to ensure that we pick the highest version of the movekey present. func (db *DB) get(key []byte) (y.ValueStruct, error) { if db.IsClosed() { return y.ValueStruct{}, ErrDBClosed } tables, decr := db.getMemTables() // Lock should be released. defer decr() var maxVs *y.ValueStruct var version uint64 if bytes.HasPrefix(key, badgerMove) { // If we are checking badgerMove key, we should look into all the // levels, so we can pick up the newer versions, which might have been // compacted down the tree. maxVs = &y.ValueStruct{} version = y.ParseTs(key) } y.NumGets.Add(1) for i := 0; i < len(tables); i++ { vs := tables[i].Get(key) y.NumMemtableGets.Add(1) if vs.Meta == 0 && vs.Value == nil { continue } // Found a version of the key. For user keyspace, return immediately. For move keyspace, // continue iterating, unless we found a version == given key version. if maxVs == nil || vs.Version == version { return vs, nil } if maxVs.Version < vs.Version { *maxVs = vs } } return db.lc.get(key, maxVs, 0) } // updateHead should not be called without the db.Lock() since db.vhead is used // by the writer go routines and memtable flushing goroutine. 
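//
// Callers hold the DB lock around the call, as writeRequests does below:
//
//	db.Lock()
//	db.updateHead(b.Ptrs)
//	db.Unlock()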
func (db *DB) updateHead(ptrs []valuePointer) { var ptr valuePointer for i := len(ptrs) - 1; i >= 0; i-- { p := ptrs[i] if !p.IsZero() { ptr = p break } } if ptr.IsZero() { return } y.AssertTrue(!ptr.Less(db.vhead)) db.vhead = ptr } var requestPool = sync.Pool{ New: func() interface{} { return new(request) }, } func (db *DB) shouldWriteValueToLSM(e Entry) bool { return len(e.Value) < db.opt.ValueThreshold } func (db *DB) writeToLSM(b *request) error { // We should check the length of b.Ptrs and b.Entries only when badger is not // running in InMemory mode. In InMemory mode, we don't write anything to the // value log and that's why the length of b.Ptrs will always be zero. if !db.opt.InMemory && len(b.Ptrs) != len(b.Entries) { return errors.Errorf("Ptrs and Entries don't match: %+v", b) } for i, entry := range b.Entries { if entry.meta&bitFinTxn != 0 { continue } if db.shouldWriteValueToLSM(*entry) { // Will include deletion / tombstone case. db.mt.Put(entry.Key, y.ValueStruct{ Value: entry.Value, // Ensure value pointer flag is removed. Otherwise, the value will fail // to be retrieved during iterator prefetch. `bitValuePointer` is only // known to be set in write to LSM when the entry is loaded from a backup // with a lower ValueThreshold and its value was stored in the value log. Meta: entry.meta &^ bitValuePointer, UserMeta: entry.UserMeta, ExpiresAt: entry.ExpiresAt, }) } else { db.mt.Put(entry.Key, y.ValueStruct{ Value: b.Ptrs[i].Encode(), Meta: entry.meta | bitValuePointer, UserMeta: entry.UserMeta, ExpiresAt: entry.ExpiresAt, }) } } return nil } // writeRequests is called serially by only one goroutine. func (db *DB) writeRequests(reqs []*request) error { if len(reqs) == 0 { return nil } done := func(err error) { for _, r := range reqs { r.Err = err r.Wg.Done() } } db.opt.Debugf("writeRequests called. Writing to value log") err := db.vlog.write(reqs) if err != nil { done(err) return err } db.opt.Debugf("Sending updates to subscribers") db.pub.sendUpdates(reqs) db.opt.Debugf("Writing to memtable") var count int for _, b := range reqs { if len(b.Entries) == 0 { continue } count += len(b.Entries) var i uint64 for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() { i++ if i%100 == 0 { db.opt.Debugf("Making room for writes") } // We need to poll a bit because both ensureRoomForWrite and the flusher need access to s.imm. // When flushChan is full and you are blocked there, and the flusher is trying to update s.imm, // you will get a deadlock. time.Sleep(10 * time.Millisecond) } if err != nil { done(err) return errors.Wrap(err, "writeRequests") } if err := db.writeToLSM(b); err != nil { done(err) return errors.Wrap(err, "writeRequests") } db.Lock() db.updateHead(b.Ptrs) db.Unlock() } done(nil) db.opt.Debugf("%d entries written", count) return nil } func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) { if atomic.LoadInt32(&db.blockWrites) == 1 { return nil, ErrBlockedWrites } var count, size int64 for _, e := range entries { size += int64(e.estimateSize(db.opt.ValueThreshold)) count++ } if count >= db.opt.maxBatchCount || size >= db.opt.maxBatchSize { return nil, ErrTxnTooBig } // We can only service one request because we need each txn to be stored in a contiguous section. // Txns should not interleave among other txns or rewrites. req := requestPool.Get().(*request) req.reset() req.Entries = entries req.Wg.Add(1) req.IncrRef() // for db write db.writeCh <- req // Handled in doWrites.
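// Note (illustrative, not in the original): the request is now owned by
// doWrites. Callers such as batchSet block on req.Wait(), which is released
// once writeRequests has written the batch to the value log and memtable and
// invoked r.Wg.Done() via its done helper.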
y.NumPuts.Add(int64(len(entries))) return req, nil } func (db *DB) doWrites(lc *y.Closer) { defer lc.Done() pendingCh := make(chan struct{}, 1) writeRequests := func(reqs []*request) { if err := db.writeRequests(reqs); err != nil { db.opt.Errorf("writeRequests: %v", err) } <-pendingCh } // This variable tracks the number of pending writes. reqLen := new(expvar.Int) y.PendingWrites.Set(db.opt.Dir, reqLen) reqs := make([]*request, 0, 10) for { var r *request select { case r = <-db.writeCh: case <-lc.HasBeenClosed(): goto closedCase } for { reqs = append(reqs, r) reqLen.Set(int64(len(reqs))) if len(reqs) >= 3*kvWriteChCapacity { pendingCh <- struct{}{} // blocking. goto writeCase } select { // Either push to pending, or continue to pick from writeCh. case r = <-db.writeCh: case pendingCh <- struct{}{}: goto writeCase case <-lc.HasBeenClosed(): goto closedCase } } closedCase: // All the pending requests are drained. // Don't close the writeCh, because it is still used in several places. for { select { case r = <-db.writeCh: reqs = append(reqs, r) default: pendingCh <- struct{}{} // Push to pending before doing a write. writeRequests(reqs) return } } writeCase: go writeRequests(reqs) reqs = make([]*request, 0, 10) reqLen.Set(0) } } // batchSet applies a list of badger.Entry. If a request level error occurs, it // will be returned. // Check(kv.BatchSet(entries)) func (db *DB) batchSet(entries []*Entry) error { req, err := db.sendToWriteCh(entries) if err != nil { return err } return req.Wait() } // batchSetAsync is the asynchronous version of batchSet. It accepts a callback // function which is called when all the sets are complete. If a request level // error occurs, it will be passed back via the callback. // err := kv.BatchSetAsync(entries, func(err error) { // Check(err) // }) func (db *DB) batchSetAsync(entries []*Entry, f func(error)) error { req, err := db.sendToWriteCh(entries) if err != nil { return err } go func() { err := req.Wait() // Write is complete. Let's call the callback function now. f(err) }() return nil } var errNoRoom = errors.New("No room for write") // ensureRoomForWrite is always called serially. func (db *DB) ensureRoomForWrite() error { var err error db.Lock() defer db.Unlock() // Here we determine if we need to force flush memtable. Given we rotated log file, it would // make sense to force flush a memtable, so the updated value head would have a chance to be // pushed to L0. Otherwise, it would not go to L0, until the memtable has been fully filled, // which can take a lot longer if the write load has fewer keys and larger values. This force // flush thus avoids the need to read through a lot of log files on a crash and restart. // The above approach is quite simple, with a small drawback. We are calling ensureRoomForWrite // before inserting every entry in the memtable. We will get the latest db.head only after all // entries for a request are inserted in the memtable. If we have done >= db.opt.LogRotatesToFlush // rotations, then while inserting the first entry in the memtable, the condition below will be // true and we will end up flushing an old value of db.head. Hence, we limit the number of value // log files to be read to db.opt.LogRotatesToFlush only. forceFlush := atomic.LoadInt32(&db.logRotates) >= db.opt.LogRotatesToFlush if !forceFlush && db.mt.MemSize() < db.opt.MaxTableSize { return nil } y.AssertTrue(db.mt != nil) // A nil mt indicates that DB is being closed. select { case db.flushChan <- flushTask{mt: db.mt, vptr: db.vhead}: // After every memtable flush, let's reset the counter.
atomic.StoreInt32(&db.logRotates, 0) // Ensure value log is synced to disk so this memtable's contents wouldn't be lost. err = db.vlog.sync(db.vhead.Fid) if err != nil { return err } db.opt.Debugf("Flushing memtable, mt.size=%d size of flushChan: %d\n", db.mt.MemSize(), len(db.flushChan)) // We managed to push this task. Let's modify imm. db.imm = append(db.imm, db.mt) db.mt = skl.NewSkiplist(arenaSize(db.opt)) // New memtable is empty. We certainly have room. return nil default: // We need to do this to unlock and allow the flusher to modify imm. return errNoRoom } } func arenaSize(opt Options) int64 { return opt.MaxTableSize + opt.maxBatchSize + opt.maxBatchCount*int64(skl.MaxNodeSize) } // buildL0Table builds a new table from the memtable. func buildL0Table(ft flushTask, bopts table.Options) []byte { iter := ft.mt.NewIterator() defer iter.Close() b := table.NewTableBuilder(bopts) defer b.Close() var vp valuePointer for iter.SeekToFirst(); iter.Valid(); iter.Next() { if len(ft.dropPrefixes) > 0 && hasAnyPrefixes(iter.Key(), ft.dropPrefixes) { continue } vs := iter.Value() if vs.Meta&bitValuePointer > 0 { vp.Decode(vs.Value) } b.Add(iter.Key(), iter.Value(), vp.Len) } return b.Finish() } type flushTask struct { mt *skl.Skiplist vptr valuePointer dropPrefixes [][]byte } func (db *DB) pushHead(ft flushTask) error { // We don't need to store the head pointer in the in-memory mode since we will // never replay anything. if db.opt.InMemory { return nil } // Ensure we never push a zero valued head pointer. if ft.vptr.IsZero() { return errors.New("Head should not be zero") } // Store the badger head pointer; we need it to compute readTs on restart. db.opt.Infof("Storing value log head: %+v\n", ft.vptr) val := ft.vptr.Encode() // Pick the max commit ts, so in case of crash, our read ts would be higher than all the // commits. headTs := y.KeyWithTs(head, db.orc.nextTs()) ft.mt.Put(headTs, y.ValueStruct{Value: val}) return nil } // handleFlushTask must be run serially. func (db *DB) handleFlushTask(ft flushTask) error { // There can be a scenario when an empty memtable is flushed. For example, the memtable is empty // and, after writing a request to the value log, the rotation count exceeds // db.opt.LogRotatesToFlush. if ft.mt.Empty() { return nil } if err := db.pushHead(ft); err != nil { return err } dk, err := db.registry.latestDataKey() if err != nil { return y.Wrapf(err, "failed to get datakey in db.handleFlushTask") } bopts := buildTableOptions(db.opt) bopts.DataKey = dk // Builder does not need cache but the same options are used for opening table. bopts.BlockCache = db.blockCache bopts.IndexCache = db.indexCache tableData := buildL0Table(ft, bopts) fileID := db.lc.reserveFileID() if db.opt.KeepL0InMemory { tbl, err := table.OpenInMemoryTable(tableData, fileID, &bopts) if err != nil { return errors.Wrapf(err, "failed to open table in memory") } return db.lc.addLevel0Table(tbl) } fd, err := y.CreateSyncedFile(table.NewFilename(fileID, db.opt.Dir), true) if err != nil { return y.Wrap(err) } // Don't block just to sync the directory entry. dirSyncCh := make(chan error, 1) go func() { dirSyncCh <- db.syncDir(db.opt.Dir) }() if _, err = fd.Write(tableData); err != nil { db.opt.Errorf("ERROR while writing to level 0: %v", err) return err } if dirSyncErr := <-dirSyncCh; dirSyncErr != nil { // Do dir sync as best effort. No need to return due to an error there.
db.opt.Errorf("ERROR while syncing level directory: %v", dirSyncErr) } tbl, err := table.OpenTable(fd, bopts) if err != nil { db.opt.Debugf("ERROR while opening table: %v", err) return err } // We own a ref on tbl. err = db.lc.addLevel0Table(tbl) // This will incrRef _ = tbl.DecrRef() // Releases our ref. return err } // flushMemtable must keep running until we send it an empty flushTask. If there // are errors during handling the flush task, we'll retry indefinitely. func (db *DB) flushMemtable(lc *y.Closer) error { defer lc.Done() for ft := range db.flushChan { if ft.mt == nil { // We close db.flushChan now, instead of sending a nil ft.mt. continue } for { err := db.handleFlushTask(ft) if err == nil { // Update s.imm. Need a lock. db.Lock() // This is a single-threaded operation. ft.mt corresponds to the head of // db.imm list. Once we flush it, we advance db.imm. The next ft.mt // which would arrive here would match db.imm[0], because we acquire a // lock over DB when pushing to flushChan. // TODO: This logic is dirty AF. Any change and this could easily break. y.AssertTrue(ft.mt == db.imm[0]) db.imm = db.imm[1:] ft.mt.DecrRef() // Return memory. db.Unlock() break } // Encountered error. Retry indefinitely. db.opt.Errorf("Failure while flushing memtable to disk: %v. Retrying...\n", err) time.Sleep(time.Second) } } return nil } func exists(path string) (bool, error) { _, err := os.Stat(path) if err == nil { return true, nil } if os.IsNotExist(err) { return false, nil } return true, err } // This function does a filewalk, calculates the size of vlog and sst files and stores it in // y.LSMSize and y.VlogSize. func (db *DB) calculateSize() { if db.opt.InMemory { return } newInt := func(val int64) *expvar.Int { v := new(expvar.Int) v.Add(val) return v } totalSize := func(dir string) (int64, int64) { var lsmSize, vlogSize int64 err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { if err != nil { return err } ext := filepath.Ext(path) switch ext { case ".sst": lsmSize += info.Size() case ".vlog": vlogSize += info.Size() } return nil }) if err != nil { db.opt.Debugf("Got error while calculating total size of directory: %s", dir) } return lsmSize, vlogSize } lsmSize, vlogSize := totalSize(db.opt.Dir) y.LSMSize.Set(db.opt.Dir, newInt(lsmSize)) // If valueDir is different from dir, we'd have to do another walk. if db.opt.ValueDir != db.opt.Dir { _, vlogSize = totalSize(db.opt.ValueDir) } y.VlogSize.Set(db.opt.ValueDir, newInt(vlogSize)) } func (db *DB) updateSize(lc *y.Closer) { defer lc.Done() if db.opt.InMemory { return } metricsTicker := time.NewTicker(time.Minute) defer metricsTicker.Stop() for { select { case <-metricsTicker.C: db.calculateSize() case <-lc.HasBeenClosed(): return } } } // RunValueLogGC triggers a value log garbage collection. // // It picks value log files to perform GC based on statistics that are collected // during compactions. If no such statistics are available, then log files are // picked in random order. The process stops as soon as the first log file is // encountered which does not result in garbage collection. // // When a log file is picked, it is first sampled. If the sample shows that we // can discard at least discardRatio space of that file, it would be rewritten. // // If a call to RunValueLogGC results in no rewrites, then an ErrNoRewrite is // thrown indicating that the call resulted in no file rewrites. // // We recommend setting discardRatio to 0.5, thus indicating that a file be // rewritten if half the space can be discarded. 
This results in a lifetime // value log write amplification of 2 (1 from original write + 0.5 rewrite + // 0.25 + 0.125 + ... = 2). Setting it to higher value would result in fewer // space reclaims, while setting it to a lower value would result in more space // reclaims at the cost of increased activity on the LSM tree. discardRatio // must be in the range (0.0, 1.0), both endpoints excluded, otherwise an // ErrInvalidRequest is returned. // // Only one GC is allowed at a time. If another value log GC is running, or DB // has been closed, this would return an ErrRejected. // // Note: Every time GC is run, it would produce a spike of activity on the LSM // tree. func (db *DB) RunValueLogGC(discardRatio float64) error { if db.opt.InMemory { return ErrGCInMemoryMode } if discardRatio >= 1.0 || discardRatio <= 0.0 { return ErrInvalidRequest } // startLevel is the level from which we should search for the head key. When badger is running // with KeepL0InMemory flag, all tables on L0 are kept in memory. This means we should pick head // key from Level 1 onwards because if we pick the headkey from Level 0 we might end up losing // data. See test TestL0GCBug. startLevel := 0 if db.opt.KeepL0InMemory { startLevel = 1 } // Find head on disk headKey := y.KeyWithTs(head, math.MaxUint64) // Need to pass with timestamp, lsm get removes the last 8 bytes and compares key val, err := db.lc.get(headKey, nil, startLevel) if err != nil { return errors.Wrap(err, "Retrieving head from on-disk LSM") } var head valuePointer if len(val.Value) > 0 { head.Decode(val.Value) } // Pick a log file and run GC return db.vlog.runGC(discardRatio, head) } // Size returns the size of lsm and value log files in bytes. It can be used to decide how often to // call RunValueLogGC. func (db *DB) Size() (lsm, vlog int64) { if y.LSMSize.Get(db.opt.Dir) == nil { lsm, vlog = 0, 0 return } lsm = y.LSMSize.Get(db.opt.Dir).(*expvar.Int).Value() vlog = y.VlogSize.Get(db.opt.ValueDir).(*expvar.Int).Value() return } // Sequence represents a Badger sequence. type Sequence struct { sync.Mutex db *DB key []byte next uint64 leased uint64 bandwidth uint64 } // Next would return the next integer in the sequence, updating the lease by running a transaction // if needed. func (seq *Sequence) Next() (uint64, error) { seq.Lock() defer seq.Unlock() if seq.next >= seq.leased { if err := seq.updateLease(); err != nil { return 0, err } } val := seq.next seq.next++ return val, nil } // Release the leased sequence to avoid wasted integers. This should be done right // before closing the associated DB. However it is valid to use the sequence after // it was released, causing a new lease with full bandwidth. 
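//
// Typical lifecycle (an illustrative sketch; the key name and bandwidth are
// arbitrary):
//
//	seq, err := db.GetSequence([]byte("sequence-key"), 1000)
//	if err != nil {
//		return err
//	}
//	defer func() { _ = seq.Release() }()
//	id, err := seq.Next()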
func (seq *Sequence) Release() error { seq.Lock() defer seq.Unlock() err := seq.db.Update(func(txn *Txn) error { item, err := txn.Get(seq.key) if err != nil { return err } var num uint64 if err := item.Value(func(v []byte) error { num = binary.BigEndian.Uint64(v) return nil }); err != nil { return err } if num == seq.leased { var buf [8]byte binary.BigEndian.PutUint64(buf[:], seq.next) return txn.SetEntry(NewEntry(seq.key, buf[:])) } return nil }) if err != nil { return err } seq.leased = seq.next return nil } func (seq *Sequence) updateLease() error { return seq.db.Update(func(txn *Txn) error { item, err := txn.Get(seq.key) switch { case err == ErrKeyNotFound: seq.next = 0 case err != nil: return err default: var num uint64 if err := item.Value(func(v []byte) error { num = binary.BigEndian.Uint64(v) return nil }); err != nil { return err } seq.next = num } lease := seq.next + seq.bandwidth var buf [8]byte binary.BigEndian.PutUint64(buf[:], lease) if err = txn.SetEntry(NewEntry(seq.key, buf[:])); err != nil { return err } seq.leased = lease return nil }) } // GetSequence would initiate a new sequence object, generating it from the stored lease, if // available, in the database. Sequence can be used to get a list of monotonically increasing // integers. Multiple sequences can be created by providing different keys. Bandwidth sets the // size of the lease, determining how many Next() requests can be served from memory. // // GetSequence is not supported on ManagedDB. Calling this would result in a panic. func (db *DB) GetSequence(key []byte, bandwidth uint64) (*Sequence, error) { if db.opt.managedTxns { panic("Cannot use GetSequence with managedDB=true.") } switch { case len(key) == 0: return nil, ErrEmptyKey case bandwidth == 0: return nil, ErrZeroBandwidth } seq := &Sequence{ db: db, key: key, next: 0, leased: 0, bandwidth: bandwidth, } err := seq.updateLease() return seq, err } // Tables gets the TableInfo objects from the level controller. If withKeysCount // is true, TableInfo objects also contain counts of keys for the tables. func (db *DB) Tables(withKeysCount bool) []TableInfo { return db.lc.getTableInfo(withKeysCount) } // KeySplits can be used to get rough key ranges to divide up iteration over // the DB. func (db *DB) KeySplits(prefix []byte) []string { var splits []string // We just want table ranges here and not keys count. for _, ti := range db.Tables(false) { // We don't use ti.Left, because that has a tendency to store !badger // keys. if bytes.HasPrefix(ti.Right, prefix) { splits = append(splits, string(ti.Right)) } } sort.Strings(splits) return splits } // MaxBatchCount returns the max possible entries in a batch. func (db *DB) MaxBatchCount() int64 { return db.opt.maxBatchCount } // MaxBatchSize returns the max possible batch size. func (db *DB) MaxBatchSize() int64 { return db.opt.maxBatchSize } func (db *DB) stopMemoryFlush() { // Stop memtable flushes. if db.closers.memtable != nil { close(db.flushChan) db.closers.memtable.SignalAndWait() } } func (db *DB) stopCompactions() { // Stop compactions. if db.closers.compactors != nil { db.closers.compactors.SignalAndWait() } } func (db *DB) startCompactions() { // Resume compactions. if db.closers.compactors != nil { db.closers.compactors = y.NewCloser(1) db.lc.startCompact(db.closers.compactors) } } func (db *DB) startMemoryFlush() { // Start memory flusher.
if db.closers.memtable != nil { db.flushChan = make(chan flushTask, db.opt.NumMemtables) db.closers.memtable = y.NewCloser(1) go func() { _ = db.flushMemtable(db.closers.memtable) }() } } // Flatten can be used to force compactions on the LSM tree so all the tables fall on the same // level. This ensures that all the versions of keys are colocated and not split across multiple // levels, which is necessary after a restore from backup. During Flatten, live compactions are // stopped. Ideally, no writes are going on during Flatten. Otherwise, it would create competition // between flattening the tree and new tables being created at level zero. func (db *DB) Flatten(workers int) error { db.stopCompactions() defer db.startCompactions() compactAway := func(cp compactionPriority) error { db.opt.Infof("Attempting to compact with %+v\n", cp) errCh := make(chan error, 1) for i := 0; i < workers; i++ { go func() { errCh <- db.lc.doCompact(175, cp) }() } var success int var rerr error for i := 0; i < workers; i++ { err := <-errCh if err != nil { rerr = err db.opt.Warningf("While running doCompact with %+v. Error: %v\n", cp, err) } else { success++ } } if success == 0 { return rerr } // We managed at least one successful compaction. So, we'll consider this a success. db.opt.Infof("%d compactor(s) succeeded. One or more tables from level %d compacted.\n", success, cp.level) return nil } hbytes := func(sz int64) string { return humanize.Bytes(uint64(sz)) } for { db.opt.Infof("\n") var levels []int for i, l := range db.lc.levels { sz := l.getTotalSize() db.opt.Infof("Level: %d. %8s Size. %8s Max.\n", i, hbytes(l.getTotalSize()), hbytes(l.maxTotalSize)) if sz > 0 { levels = append(levels, i) } } if len(levels) <= 1 { prios := db.lc.pickCompactLevels() if len(prios) == 0 || prios[0].score <= 1.0 { db.opt.Infof("All tables consolidated into one level. Flattening done.\n") return nil } if err := compactAway(prios[0]); err != nil { return err } continue } // Create an artificial compaction priority, to ensure that we compact the level. cp := compactionPriority{level: levels[0], score: 1.71} if err := compactAway(cp); err != nil { return err } } } func (db *DB) blockWrite() error { // Stop accepting new writes. if !atomic.CompareAndSwapInt32(&db.blockWrites, 0, 1) { return ErrBlockedWrites } // Make all pending writes finish. The following will also close writeCh. db.closers.writes.SignalAndWait() db.opt.Infof("Writes flushed. Stopping compactions now...") return nil } func (db *DB) unblockWrite() { db.closers.writes = y.NewCloser(1) go db.doWrites(db.closers.writes) // Resume writes. atomic.StoreInt32(&db.blockWrites, 0) } func (db *DB) prepareToDrop() (func(), error) { if db.opt.ReadOnly { panic("Attempting to drop data in read-only mode.") } // In order to prepare for a drop, we need to block the incoming writes and write any queued // requests to the db. Then, flush all the pending flush tasks, so that we don't miss any // entries. if err := db.blockWrite(); err != nil { return nil, err } reqs := make([]*request, 0, 10) for { select { case r := <-db.writeCh: reqs = append(reqs, r) default: if err := db.writeRequests(reqs); err != nil { db.opt.Errorf("writeRequests: %v", err) } db.stopMemoryFlush() return func() { db.opt.Infof("Resuming writes") db.startMemoryFlush() db.unblockWrite() }, nil } } } // DropAll would drop all the data stored in Badger. It does this in the following way. // - Stop accepting new writes. // - Pause memtable flushes and compactions.
// - Pick all tables from all levels, create a changeset to delete all these // tables and apply it to manifest. // - Pick all log files from value log, and delete all of them. Restart value log files from zero. // - Resume memtable flushes and compactions. // // NOTE: DropAll is resilient to concurrent writes, but not to reads. It is up to the user to not do // any reads while DropAll is going on, otherwise they may result in panics. Ideally, both reads and // writes are paused before running DropAll, and resumed after it is finished. func (db *DB) DropAll() error { f, err := db.dropAll() if f != nil { f() } return err } func (db *DB) dropAll() (func(), error) { db.opt.Infof("DropAll called. Blocking writes...") f, err := db.prepareToDrop() if err != nil { return f, err } // prepareToDrop will stop all the incoming writes and flush any pending flush tasks. Before // we drop, we'll stop the compactions because all the data is going to be deleted anyway. db.stopCompactions() resume := func() { db.startCompactions() f() } // Block all foreign interactions with memory tables. db.Lock() defer db.Unlock() // Remove in-memory tables. Calling DecrRef for safety. Not sure if these are absolutely needed. db.mt.DecrRef() for _, mt := range db.imm { mt.DecrRef() } db.imm = db.imm[:0] db.mt = skl.NewSkiplist(arenaSize(db.opt)) // Set it up for future writes. num, err := db.lc.dropTree() if err != nil { return resume, err } db.opt.Infof("Deleted %d SSTables. Now deleting value logs...\n", num) num, err = db.vlog.dropAll() if err != nil { return resume, err } db.vhead = valuePointer{} // Zero it out. db.lc.nextFileID = 1 db.opt.Infof("Deleted %d value log files. DropAll done.\n", num) db.blockCache.Clear() db.indexCache.Clear() return resume, nil } // DropPrefix would drop all the keys with the provided prefix. It does this in the following way: // - Stop accepting new writes. // - Stop memtable flushes before acquiring the lock, because we acquire the lock here and a // memtable flush would stall on the same lock, which would lead to deadlock. // - Flush out all memtables, skipping over keys with the given prefix, Kp. // - Write out the value log header to memtables when flushing, so we don't accidentally bring Kp // back after a restart. // - Stop compaction. // - Compact L0->L1, skipping over Kp. // - Compact rest of the levels, Li->Li, picking tables which have Kp. // - Resume memtable flushes, compactions and writes. func (db *DB) DropPrefix(prefixes ...[]byte) error { db.opt.Infof("DropPrefix Called") f, err := db.prepareToDrop() if err != nil { return err } defer f() // Block all foreign interactions with memory tables. db.Lock() defer db.Unlock() db.imm = append(db.imm, db.mt) for _, memtable := range db.imm { if memtable.Empty() { memtable.DecrRef() continue } task := flushTask{ mt: memtable, // Ensure that the head of value log gets persisted to disk. vptr: db.vhead, dropPrefixes: prefixes, } db.opt.Debugf("Flushing memtable") if err := db.handleFlushTask(task); err != nil { db.opt.Errorf("While trying to flush memtable: %v", err) return err } memtable.DecrRef() } db.stopCompactions() defer db.startCompactions() db.imm = db.imm[:0] db.mt = skl.NewSkiplist(arenaSize(db.opt)) // Drop prefixes from the levels. if err := db.lc.dropPrefixes(prefixes); err != nil { return err } db.opt.Infof("DropPrefix done") return nil } // KVList contains a list of key-value pairs. type KVList = pb.KVList // Subscribe can be used to watch key changes for the given key prefixes.
// At least one prefix should be passed, or an error will be returned. // You can use an empty prefix to monitor all changes to the DB. // This function blocks until the given context is done or an error occurs. // The given function will be called with a new KVList containing the modified keys and the // corresponding values. func (db *DB) Subscribe(ctx context.Context, cb func(kv *KVList) error, prefixes ...[]byte) error { if cb == nil { return ErrNilCallback } c := y.NewCloser(1) recvCh, id := db.pub.newSubscriber(c, prefixes...) slurp := func(batch *pb.KVList) error { for { select { case kvs := <-recvCh: batch.Kv = append(batch.Kv, kvs.Kv...) default: if len(batch.GetKv()) > 0 { return cb(batch) } return nil } } } for { select { case <-c.HasBeenClosed(): // No need to delete here. Closer will be called only while // closing DB. Subscriber will be deleted by cleanSubscribers. err := slurp(new(pb.KVList)) // Drain if any pending updates. c.Done() return err case <-ctx.Done(): c.Done() db.pub.deleteSubscriber(id) // Delete the subscriber to avoid further updates. return ctx.Err() case batch := <-recvCh: err := slurp(batch) if err != nil { c.Done() // Delete the subscriber if there is an error by the callback. db.pub.deleteSubscriber(id) return err } } } } // shouldEncrypt returns bool, which tells whether to encrypt or not. func (db *DB) shouldEncrypt() bool { return len(db.opt.EncryptionKey) > 0 } func (db *DB) syncDir(dir string) error { if db.opt.InMemory { return nil } return syncDir(dir) } func createDirs(opt Options) error { for _, path := range []string{opt.Dir, opt.ValueDir} { dirExists, err := exists(path) if err != nil { return y.Wrapf(err, "Invalid Dir: %q", path) } if !dirExists { if opt.ReadOnly { return errors.Errorf("Cannot find directory %q for read-only open", path) } // Try to create the directory err = os.Mkdir(path, 0700) if err != nil { return y.Wrapf(err, "Error Creating Dir: %q", path) } } } return nil } badger-2.2007.2/db2_test.go000066400000000000000000000562361372173116500152470ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "encoding/binary" "flag" "fmt" "io/ioutil" "log" "math" "math/rand" "os" "path" "regexp" "runtime" "sync" "testing" "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) func TestTruncateVlogWithClose(t *testing.T) { key := func(i int) []byte { return []byte(fmt.Sprintf("%d%10d", i, i)) } data := func(l int) []byte { m := make([]byte, l) _, err := rand.Read(m) require.NoError(t, err) return m } dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.SyncWrites = true opt.Truncate = true opt.ValueThreshold = 1 // Force all reads from value log. 
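// (Illustrative note, not in the original: shouldWriteValueToLSM keeps a value
// inline only when len(value) < ValueThreshold, so a threshold of 1 sends every
// non-empty value to the value log, which is what this truncation test relies on.)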
db, err := Open(opt) require.NoError(t, err) err = db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry(key(0), data(4055))) }) require.NoError(t, err) // Close the DB. require.NoError(t, db.Close()) require.NoError(t, os.Truncate(path.Join(dir, "000000.vlog"), 4090)) // Reopen and write some new data. db, err = Open(opt) require.NoError(t, err) for i := 0; i < 32; i++ { err := db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry(key(i), data(10))) }) require.NoError(t, err) } // Read it back to ensure that we can read it now. for i := 0; i < 32; i++ { err := db.View(func(txn *Txn) error { item, err := txn.Get(key(i)) require.NoError(t, err) val := getItemValue(t, item) require.Equal(t, 10, len(val)) return nil }) require.NoError(t, err) } require.NoError(t, db.Close()) // Reopen and read the data again. db, err = Open(opt) require.NoError(t, err) for i := 0; i < 32; i++ { err := db.View(func(txn *Txn) error { item, err := txn.Get(key(i)) require.NoError(t, err) val := getItemValue(t, item) require.Equal(t, 10, len(val)) return nil }) require.NoError(t, err) } require.NoError(t, db.Close()) } var manual = flag.Bool("manual", false, "Set when manually running some tests.") // Badger dir to be used for performing db.Open benchmark. var benchDir = flag.String("benchdir", "", "Set when running db.Open benchmark") // The following 3 TruncateVlogNoClose tests should be run one after another. // None of these close the DB, simulating a crash. They should be run with a // script, which truncates the value log to 4090, lining up with the end of the // first entry in the txn. At <4090, it would cause the entry to be truncated // immediately, at >4090, same thing. func TestTruncateVlogNoClose(t *testing.T) { if !*manual { t.Skip("Skipping test meant to be run manually.") return } dir := "p" opts := getTestOptions(dir) opts.SyncWrites = true opts.Truncate = true kv, err := Open(opts) require.NoError(t, err) key := func(i int) string { return fmt.Sprintf("%d%10d", i, i) } data := fmt.Sprintf("%4055d", 1) err = kv.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(key(0)), []byte(data))) }) require.NoError(t, err) } func TestTruncateVlogNoClose2(t *testing.T) { if !*manual { t.Skip("Skipping test meant to be run manually.") return } dir := "p" opts := getTestOptions(dir) opts.SyncWrites = true opts.Truncate = true kv, err := Open(opts) require.NoError(t, err) key := func(i int) string { return fmt.Sprintf("%d%10d", i, i) } data := fmt.Sprintf("%10d", 1) for i := 32; i < 64; i++ { err := kv.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(key(i)), []byte(data))) }) require.NoError(t, err) } for i := 32; i < 64; i++ { require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get([]byte(key(i))) require.NoError(t, err) val := getItemValue(t, item) require.NotNil(t, val) require.True(t, len(val) > 0) return nil })) } } func TestTruncateVlogNoClose3(t *testing.T) { if !*manual { t.Skip("Skipping test meant to be run manually.") return } fmt.Print("Running") dir := "p" opts := getTestOptions(dir) opts.SyncWrites = true opts.Truncate = true kv, err := Open(opts) require.NoError(t, err) key := func(i int) string { return fmt.Sprintf("%d%10d", i, i) } for i := 32; i < 64; i++ { require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get([]byte(key(i))) require.NoError(t, err) val := getItemValue(t, item) require.NotNil(t, val) require.True(t, len(val) > 0) return nil })) } } func TestBigKeyValuePairs(t *testing.T) { // This test takes too much 
memory. So, run separately. if !*manual { t.Skip("Skipping test meant to be run manually.") return } // Passing an empty directory since it will be filled by runBadgerTest. opts := DefaultOptions(""). WithMaxTableSize(1 << 20). WithValueLogMaxEntries(64) runBadgerTest(t, &opts, func(t *testing.T, db *DB) { bigK := make([]byte, 65001) bigV := make([]byte, db.opt.ValueLogFileSize+1) small := make([]byte, 65000) txn := db.NewTransaction(true) require.Regexp(t, regexp.MustCompile("Key.*exceeded"), txn.SetEntry(NewEntry(bigK, small))) require.Regexp(t, regexp.MustCompile("Value.*exceeded"), txn.SetEntry(NewEntry(small, bigV))) require.NoError(t, txn.SetEntry(NewEntry(small, small))) require.Regexp(t, regexp.MustCompile("Key.*exceeded"), txn.SetEntry(NewEntry(bigK, bigV))) require.NoError(t, db.View(func(txn *Txn) error { _, err := txn.Get(small) require.Equal(t, ErrKeyNotFound, err) return nil })) // Now run a longer test, which involves value log GC. data := fmt.Sprintf("%100d", 1) key := func(i int) string { return fmt.Sprintf("%65000d", i) } saveByKey := func(key string, value []byte) error { return db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(key), value)) }) } getByKey := func(key string) error { return db.View(func(txn *Txn) error { item, err := txn.Get([]byte(key)) if err != nil { return err } return item.Value(func(val []byte) error { if len(val) == 0 { log.Fatalf("key not found %q", len(key)) } return nil }) }) } for i := 0; i < 32; i++ { if i < 30 { require.NoError(t, saveByKey(key(i), []byte(data))) } else { require.NoError(t, saveByKey(key(i), []byte(fmt.Sprintf("%100d", i)))) } } for j := 0; j < 5; j++ { for i := 0; i < 32; i++ { if i < 30 { require.NoError(t, saveByKey(key(i), []byte(data))) } else { require.NoError(t, saveByKey(key(i), []byte(fmt.Sprintf("%100d", i)))) } } } for i := 0; i < 32; i++ { require.NoError(t, getByKey(key(i))) } var loops int var err error for err == nil { err = db.RunValueLogGC(0.5) require.NotRegexp(t, regexp.MustCompile("truncate"), err) loops++ } t.Logf("Ran value log GC %d times. Last error: %v\n", loops, err) }) } // The following test checks for issue #585. func TestPushValueLogLimit(t *testing.T) { // This test takes too much memory. So, run separately. if !*manual { t.Skip("Skipping test meant to be run manually.") return } // Passing an empty directory since it will be filled by runBadgerTest. opt := DefaultOptions(""). WithValueLogMaxEntries(64). WithValueLogFileSize(2 << 30) runBadgerTest(t, &opt, func(t *testing.T, db *DB) { data := []byte(fmt.Sprintf("%30d", 1)) key := func(i int) string { return fmt.Sprintf("%100d", i) } for i := 0; i < 32; i++ { if i == 4 { v := make([]byte, math.MaxInt32) err := db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(key(i)), v)) }) require.NoError(t, err) } else { err := db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(key(i)), data)) }) require.NoError(t, err) } } for i := 0; i < 32; i++ { err := db.View(func(txn *Txn) error { item, err := txn.Get([]byte(key(i))) require.NoError(t, err, "Getting key: %s", key(i)) err = item.Value(func(v []byte) error { _ = v return nil }) require.NoError(t, err, "Getting value: %s", key(i)) return nil }) require.NoError(t, err) } }) } // The following benchmark test is supposed to be run against a badger directory with some data. // Use badger fill to create data if it doesn't exist. 
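//
// Example invocation (an illustrative sketch; the path is whatever directory
// `badger fill` populated):
//
//	go test -run=NONE -bench=BenchmarkDBOpen -benchdir=/path/to/badger/data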
func BenchmarkDBOpen(b *testing.B) { if *benchDir == "" { b.Skip("Please set -benchdir to badger directory") } dir := *benchDir // Passing an empty directory since it will be filled by runBadgerTest. opt := DefaultOptions(dir). WithReadOnly(true) for i := 0; i < b.N; i++ { db, err := Open(opt) require.NoError(b, err) require.NoError(b, db.Close()) } } // Regression test for https://github.com/dgraph-io/badger/issues/830 func TestDiscardMapTooBig(t *testing.T) { createDiscardStats := func() map[uint32]int64 { stat := map[uint32]int64{} for i := uint32(0); i < 8000; i++ { stat[i] = 0 } return stat } dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(dir)) require.NoError(t, err, "error while opening db") // Add some data so that memtable flush happens on close. require.NoError(t, db.Update(func(txn *Txn) error { return txn.Set([]byte("foo"), []byte("bar")) })) // overwrite discardstat with large value db.vlog.lfDiscardStats.m = createDiscardStats() require.NoError(t, db.Close()) // reopen the same DB db, err = Open(DefaultOptions(dir)) require.NoError(t, err, "error while opening db") require.NoError(t, db.Close()) } // Test for values of size uint32. func TestBigValues(t *testing.T) { if !*manual { t.Skip("Skipping test meant to be run manually.") return } opts := DefaultOptions(""). WithValueThreshold(1 << 20). WithValueLogMaxEntries(100) test := func(t *testing.T, db *DB) { keyCount := 1000 data := bytes.Repeat([]byte("a"), (1 << 20)) // Valuesize 1 MB. key := func(i int) string { return fmt.Sprintf("%65000d", i) } saveByKey := func(key string, value []byte) error { return db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(key), value)) }) } getByKey := func(key string) error { return db.View(func(txn *Txn) error { item, err := txn.Get([]byte(key)) if err != nil { return err } return item.Value(func(val []byte) error { if len(val) == 0 || len(val) != len(data) || !bytes.Equal(val, []byte(data)) { log.Fatalf("key not found %q", len(key)) } return nil }) }) } for i := 0; i < keyCount; i++ { require.NoError(t, saveByKey(key(i), []byte(data))) } for i := 0; i < keyCount; i++ { require.NoError(t, getByKey(key(i))) } } t.Run("disk mode", func(t *testing.T) { runBadgerTest(t, &opts, func(t *testing.T, db *DB) { test(t, db) }) }) t.Run("InMemory mode", func(t *testing.T) { opts.InMemory = true opts.Dir = "" opts.ValueDir = "" db, err := Open(opts) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) } // This test is for compaction file picking testing. We are creating db with two levels. We have 10 // tables on level 3 and 3 tables on level 2. Tables on level 2 have overlap with 2, 4, 3 tables on // level 3. func TestCompactionFilePicking(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(dir).WithTableLoadingMode(options.LoadToRAM)) require.NoError(t, err, "error while opening db") defer func() { require.NoError(t, db.Close()) }() l3 := db.lc.levels[3] for i := 1; i <= 10; i++ { // Each table has difference of 1 between smallest and largest key. tab := createTableWithRange(t, db, 2*i-1, 2*i) addToManifest(t, db, tab, 3) require.NoError(t, l3.replaceTables([]*table.Table{}, []*table.Table{tab})) } l2 := db.lc.levels[2] // First table has keys 1 and 4. 
tab := createTableWithRange(t, db, 1, 4) addToManifest(t, db, tab, 2) require.NoError(t, l2.replaceTables([]*table.Table{}, []*table.Table{tab})) // Second table has keys 5 and 12. tab = createTableWithRange(t, db, 5, 12) addToManifest(t, db, tab, 2) require.NoError(t, l2.replaceTables([]*table.Table{}, []*table.Table{tab})) // Third table has keys 13 and 18. tab = createTableWithRange(t, db, 13, 18) addToManifest(t, db, tab, 2) require.NoError(t, l2.replaceTables([]*table.Table{}, []*table.Table{tab})) cdef := &compactDef{ thisLevel: db.lc.levels[2], nextLevel: db.lc.levels[3], } tables := db.lc.levels[2].tables db.lc.sortByOverlap(tables, cdef) var expKey [8]byte // First table should be with smallest and biggest keys as 1 and 4. binary.BigEndian.PutUint64(expKey[:], uint64(1)) require.Equal(t, expKey[:], y.ParseKey(tables[0].Smallest())) binary.BigEndian.PutUint64(expKey[:], uint64(4)) require.Equal(t, expKey[:], y.ParseKey(tables[0].Biggest())) // Second table should be with smallest and biggest keys as 13 and 18. binary.BigEndian.PutUint64(expKey[:], uint64(13)) require.Equal(t, expKey[:], y.ParseKey(tables[1].Smallest())) binary.BigEndian.PutUint64(expKey[:], uint64(18)) require.Equal(t, expKey[:], y.ParseKey(tables[1].Biggest())) // Third table should be with smallest and biggest keys as 5 and 12. binary.BigEndian.PutUint64(expKey[:], uint64(5)) require.Equal(t, expKey[:], y.ParseKey(tables[2].Smallest())) binary.BigEndian.PutUint64(expKey[:], uint64(12)) require.Equal(t, expKey[:], y.ParseKey(tables[2].Biggest())) } // addToManifest function is used in TestCompactionFilePicking. It adds table to db manifest. func addToManifest(t *testing.T, db *DB, tab *table.Table, level uint32) { change := &pb.ManifestChange{ Id: tab.ID(), Op: pb.ManifestChange_CREATE, Level: level, Compression: uint32(tab.CompressionType()), } require.NoError(t, db.manifest.addChanges([]*pb.ManifestChange{change}), "unable to add to manifest") } // createTableWithRange function is used in TestCompactionFilePicking. It creates // a table with key starting from start and ending with end. func createTableWithRange(t *testing.T, db *DB, start, end int) *table.Table { bopts := buildTableOptions(db.opt) b := table.NewTableBuilder(bopts) nums := []int{start, end} for _, i := range nums { key := make([]byte, 8) binary.BigEndian.PutUint64(key[:], uint64(i)) key = y.KeyWithTs(key, uint64(0)) val := y.ValueStruct{Value: []byte(fmt.Sprintf("%d", i))} b.Add(key, val, 0) } fileID := db.lc.reserveFileID() fd, err := y.CreateSyncedFile(table.NewFilename(fileID, db.opt.Dir), true) require.NoError(t, err) _, err = fd.Write(b.Finish()) require.NoError(t, err, "unable to write to file") tab, err := table.OpenTable(fd, bopts) require.NoError(t, err) return tab } func TestReadSameVlog(t *testing.T) { key := func(i int) []byte { return []byte(fmt.Sprintf("%d%10d", i, i)) } testReadingSameKey := func(t *testing.T, db *DB) { // Forcing to read all values from vlog. 
for i := 0; i < 50; i++ { err := db.Update(func(txn *Txn) error { return txn.Set(key(i), key(i)) }) require.NoError(t, err) } // reading it again several times for i := 0; i < 50; i++ { for j := 0; j < 10; j++ { err := db.View(func(txn *Txn) error { item, err := txn.Get(key(i)) require.NoError(t, err) require.Equal(t, key(i), getItemValue(t, item)) return nil }) require.NoError(t, err) } } } t.Run("Test Read Again Plain Text", func(t *testing.T) { opt := getTestOptions("") // Forcing to read from vlog opt.ValueThreshold = 1 runBadgerTest(t, nil, func(t *testing.T, db *DB) { testReadingSameKey(t, db) }) }) t.Run("Test Read Again Encryption", func(t *testing.T) { opt := getTestOptions("") opt.ValueThreshold = 1 // Generate encryption key. eKey := make([]byte, 32) _, err := rand.Read(eKey) require.NoError(t, err) opt.EncryptionKey = eKey runBadgerTest(t, nil, func(t *testing.T, db *DB) { testReadingSameKey(t, db) }) }) } // The test ensures we don't lose data when badger is opened with KeepL0InMemory and GC is being // done. func TestL0GCBug(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) // Do not change any of the options below unless it's necessary. opts := getTestOptions(dir) opts.NumLevelZeroTables = 50 opts.NumLevelZeroTablesStall = 51 opts.ValueLogMaxEntries = 2 opts.ValueThreshold = 2 opts.KeepL0InMemory = true // Setting LoadingMode to mmap seems to cause segmentation fault while closing DB. opts.ValueLogLoadingMode = options.FileIO opts.TableLoadingMode = options.FileIO db1, err := Open(opts) require.NoError(t, err) key := func(i int) []byte { return []byte(fmt.Sprintf("%10d", i)) } val := []byte{1, 1, 1, 1, 1, 1, 1, 1} // Insert 100 entries. This will create about 50*3 vlog files and 6 SST files. for i := 0; i < 3; i++ { for j := 0; j < 100; j++ { err = db1.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry(key(j), val)) }) require.NoError(t, err) } } // Run value log GC multiple times. This would ensure at least // one value log file is garbage collected. success := 0 for i := 0; i < 10; i++ { err := db1.RunValueLogGC(0.01) if err == nil { success++ } if err != nil && err != ErrNoRewrite { t.Fatalf(err.Error()) } } // Ensure alteast one GC call was successful. require.NotZero(t, success) // CheckKeys reads all the keys previously stored. checkKeys := func(db *DB) { for i := 0; i < 100; i++ { err := db.View(func(txn *Txn) error { item, err := txn.Get(key(i)) require.NoError(t, err) val1 := getItemValue(t, item) require.Equal(t, val, val1) return nil }) require.NoError(t, err) } } checkKeys(db1) // Simulate a crash by not closing db1 but releasing the locks. if db1.dirLockGuard != nil { require.NoError(t, db1.dirLockGuard.release()) } if db1.valueDirGuard != nil { require.NoError(t, db1.valueDirGuard.release()) } for _, f := range db1.vlog.filesMap { require.NoError(t, f.fd.Close()) } require.NoError(t, db1.registry.Close()) require.NoError(t, db1.lc.close()) require.NoError(t, db1.manifest.close()) db2, err := Open(opts) require.NoError(t, err) // Ensure we still have all the keys. checkKeys(db2) require.NoError(t, db2.Close()) } // Regression test for https://github.com/dgraph-io/badger/issues/1126 // // The test has 3 steps // Step 1 - Create badger data. It is necessary that the value size is // greater than valuethreshold. The value log file size after // this step is around 170 bytes. // Step 2 - Re-open the same badger and simulate a crash. 
The value log file // size after this crash is around 2 GB (we increase the file size to mmap it). // Step 3 - Re-open the same badger. We should be able to read all the data // inserted in the first step. func TestWindowsDataLoss(t *testing.T) { if runtime.GOOS != "windows" { t.Skip("The test is only for Windows.") } dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := DefaultOptions(dir).WithSyncWrites(true) opt.ValueThreshold = 32 db, err := Open(opt) require.NoError(t, err) keyCount := 20 var keyList [][]byte // Stores all the keys generated. for i := 0; i < keyCount; i++ { // It is important that we create different transactions for each request. err := db.Update(func(txn *Txn) error { key := []byte(fmt.Sprintf("%d", i)) v := []byte("barValuebarValuebarValuebarValuebarValue") require.Greater(t, len(v), opt.ValueThreshold) //32 bytes length and now it's not working err := txn.Set(key, v) require.NoError(t, err) keyList = append(keyList, key) return nil }) require.NoError(t, err) } require.NoError(t, db.Close()) opt.Truncate = true db, err = Open(opt) require.NoError(t, err) // Return after reading one entry. We're simulating a crash. // Simulate a crash by not closing db but releasing the locks. if db.dirLockGuard != nil { require.NoError(t, db.dirLockGuard.release()) } if db.valueDirGuard != nil { require.NoError(t, db.valueDirGuard.release()) } // Don't use vlog.Close here. We don't want to fix the file size. Only un-mmap // the data so that we can truncate the file durning the next vlog.Open. require.NoError(t, y.Munmap(db.vlog.filesMap[db.vlog.maxFid].fmap)) for _, f := range db.vlog.filesMap { require.NoError(t, f.fd.Close()) } require.NoError(t, db.registry.Close()) require.NoError(t, db.manifest.close()) require.NoError(t, db.lc.close()) opt.Truncate = true db, err = Open(opt) require.NoError(t, err) defer db.Close() txn := db.NewTransaction(false) defer txn.Discard() it := txn.NewIterator(DefaultIteratorOptions) defer it.Close() var result [][]byte // stores all the keys read from the db. for it.Rewind(); it.Valid(); it.Next() { item := it.Item() k := item.Key() err := item.Value(func(v []byte) error { _ = v return nil }) require.NoError(t, err) result = append(result, k) } require.ElementsMatch(t, keyList, result) } func TestDropAllDropPrefix(t *testing.T) { key := func(i int) []byte { return []byte(fmt.Sprintf("%10d", i)) } val := func(i int) []byte { return []byte(fmt.Sprintf("%128d", i)) } runBadgerTest(t, nil, func(t *testing.T, db *DB) { wb := db.NewWriteBatch() defer wb.Cancel() N := 50000 for i := 0; i < N; i++ { require.NoError(t, wb.Set(key(i), val(i))) } require.NoError(t, wb.Flush()) var wg sync.WaitGroup wg.Add(3) go func() { defer wg.Done() err := db.DropPrefix([]byte("000")) for err == ErrBlockedWrites { fmt.Printf("DropPrefix 000 err: %v", err) err = db.DropPrefix([]byte("000")) time.Sleep(time.Millisecond * 500) } require.NoError(t, err) }() go func() { defer wg.Done() err := db.DropPrefix([]byte("111")) for err == ErrBlockedWrites { fmt.Printf("DropPrefix 111 err: %v", err) err = db.DropPrefix([]byte("111")) time.Sleep(time.Millisecond * 500) } require.NoError(t, err) }() go func() { time.Sleep(time.Millisecond) // Let drop prefix run first. 
defer wg.Done() err := db.DropAll() for err == ErrBlockedWrites { fmt.Printf("dropAll err: %v", err) err = db.DropAll() time.Sleep(time.Millisecond * 300) } require.NoError(t, err) }() wg.Wait() }) } func TestIsClosed(t *testing.T) { test := func(inMemory bool) { opt := DefaultOptions("") if inMemory { opt.InMemory = true } else { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt.Dir = dir opt.ValueDir = dir } db, err := Open(opt) require.NoError(t, err) require.False(t, db.IsClosed()) require.NoError(t, db.Close()) require.True(t, db.IsClosed()) } t.Run("normal", func(t *testing.T) { test(false) }) t.Run("in-memory", func(t *testing.T) { test(true) }) } badger-2.2007.2/db_test.go000066400000000000000000001511341372173116500151560ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "context" "encoding/binary" "flag" "fmt" "io/ioutil" "math" "math/rand" "os" "path/filepath" "runtime" "sort" "sync" "testing" "time" "github.com/stretchr/testify/require" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/skl" "github.com/dgraph-io/badger/v2/y" ) var mmap = flag.Bool("vlog_mmap", true, "Specify if value log must be memory-mapped") // summary is produced when DB is closed. Currently it is used only for testing. type summary struct { fileIDs map[uint64]bool } func (s *levelsController) getSummary() *summary { out := &summary{ fileIDs: make(map[uint64]bool), } for _, l := range s.levels { l.getSummary(out) } return out } func (s *levelHandler) getSummary(sum *summary) { s.RLock() defer s.RUnlock() for _, t := range s.tables { sum.fileIDs[t.ID()] = true } } func (s *DB) validate() error { return s.lc.validate() } func getTestOptions(dir string) Options { opt := DefaultOptions(dir). WithMaxTableSize(1 << 15). // Force more compaction. WithLevelOneSize(4 << 15). // Force more compaction. WithSyncWrites(false). WithBlockCacheSize(10 << 20) if !*mmap { return opt.WithValueLogLoadingMode(options.FileIO) } return opt } func getItemValue(t *testing.T, item *Item) (val []byte) { t.Helper() var v []byte err := item.Value(func(val []byte) error { v = append(v, val...) return nil }) if err != nil { t.Error(err) } if v == nil { return nil } another, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, v, another) return v } func txnSet(t *testing.T, kv *DB, key []byte, val []byte, meta byte) { txn := kv.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(key, val).WithMeta(meta))) require.NoError(t, txn.Commit()) } func txnDelete(t *testing.T, kv *DB, key []byte) { txn := kv.NewTransaction(true) require.NoError(t, txn.Delete(key)) require.NoError(t, txn.Commit()) } // Opens a badger db and runs a a test on it. 
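//
// For example (illustrative), most tests below wrap their body as:
//
//	runBadgerTest(t, nil, func(t *testing.T, db *DB) {
//		txnSet(t, db, []byte("key"), []byte("val"), 0x00)
//	})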
func runBadgerTest(t *testing.T, opts *Options, test func(t *testing.T, db *DB)) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) if opts == nil { opts = new(Options) *opts = getTestOptions(dir) } else { opts.Dir = dir opts.ValueDir = dir } if opts.InMemory { opts.Dir = "" opts.ValueDir = "" } db, err := Open(*opts) require.NoError(t, err) defer func() { require.NoError(t, db.Close()) }() test(t, db) } func TestWrite(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { for i := 0; i < 100; i++ { txnSet(t, db, []byte(fmt.Sprintf("key%d", i)), []byte(fmt.Sprintf("val%d", i)), 0x00) } }) } func TestUpdateAndView(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { err := db.Update(func(txn *Txn) error { for i := 0; i < 10; i++ { entry := NewEntry([]byte(fmt.Sprintf("key%d", i)), []byte(fmt.Sprintf("val%d", i))) if err := txn.SetEntry(entry); err != nil { return err } } return nil }) require.NoError(t, err) err = db.View(func(txn *Txn) error { for i := 0; i < 10; i++ { item, err := txn.Get([]byte(fmt.Sprintf("key%d", i))) if err != nil { return err } expected := []byte(fmt.Sprintf("val%d", i)) if err := item.Value(func(val []byte) error { require.Equal(t, expected, val, "Invalid value for key %q. expected: %q, actual: %q", item.Key(), expected, val) return nil }); err != nil { return err } } return nil }) require.NoError(t, err) }) } func TestConcurrentWrite(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // Not a benchmark. Just a simple test for concurrent writes. n := 20 m := 500 var wg sync.WaitGroup for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() for j := 0; j < m; j++ { txnSet(t, db, []byte(fmt.Sprintf("k%05d_%08d", i, j)), []byte(fmt.Sprintf("v%05d_%08d", i, j)), byte(j%127)) } }(i) } wg.Wait() t.Log("Starting iteration") opt := IteratorOptions{} opt.Reverse = false opt.PrefetchSize = 10 opt.PrefetchValues = true txn := db.NewTransaction(true) it := txn.NewIterator(opt) defer it.Close() var i, j int for it.Rewind(); it.Valid(); it.Next() { item := it.Item() k := item.Key() if k == nil { break // end of iteration. 
} require.EqualValues(t, fmt.Sprintf("k%05d_%08d", i, j), string(k)) v := getItemValue(t, item) require.EqualValues(t, fmt.Sprintf("v%05d_%08d", i, j), string(v)) require.Equal(t, item.UserMeta(), byte(j%127)) j++ if j == m { i++ j = 0 } } require.EqualValues(t, n, i) require.EqualValues(t, 0, j) }) } func TestGet(t *testing.T) { test := func(t *testing.T, db *DB) { txnSet(t, db, []byte("key1"), []byte("val1"), 0x08) txn := db.NewTransaction(false) item, err := txn.Get([]byte("key1")) require.NoError(t, err) require.EqualValues(t, "val1", getItemValue(t, item)) require.Equal(t, byte(0x08), item.UserMeta()) txn.Discard() txnSet(t, db, []byte("key1"), []byte("val2"), 0x09) txn = db.NewTransaction(false) item, err = txn.Get([]byte("key1")) require.NoError(t, err) require.EqualValues(t, "val2", getItemValue(t, item)) require.Equal(t, byte(0x09), item.UserMeta()) txn.Discard() txnDelete(t, db, []byte("key1")) txn = db.NewTransaction(false) _, err = txn.Get([]byte("key1")) require.Equal(t, ErrKeyNotFound, err) txn.Discard() txnSet(t, db, []byte("key1"), []byte("val3"), 0x01) txn = db.NewTransaction(false) item, err = txn.Get([]byte("key1")) require.NoError(t, err) require.EqualValues(t, "val3", getItemValue(t, item)) require.Equal(t, byte(0x01), item.UserMeta()) longVal := make([]byte, 1000) txnSet(t, db, []byte("key1"), longVal, 0x00) txn = db.NewTransaction(false) item, err = txn.Get([]byte("key1")) require.NoError(t, err) require.EqualValues(t, longVal, getItemValue(t, item)) txn.Discard() } t.Run("disk mode", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { test(t, db) }) }) t.Run("InMemory mode", func(t *testing.T) { opts := DefaultOptions("").WithInMemory(true) db, err := Open(opts) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) t.Run("cache enabled", func(t *testing.T) { opts := DefaultOptions("").WithBlockCacheSize(10 << 20) runBadgerTest(t, &opts, func(t *testing.T, db *DB) { test(t, db) }) }) } func TestGetAfterDelete(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // populate with one entry key := []byte("key") txnSet(t, db, key, []byte("val1"), 0x00) require.NoError(t, db.Update(func(txn *Txn) error { err := txn.Delete(key) require.NoError(t, err) _, err = txn.Get(key) require.Equal(t, ErrKeyNotFound, err) return nil })) }) } func TestTxnTooBig(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { data := func(i int) []byte { return []byte(fmt.Sprintf("%b", i)) } // n := 500000 n := 1000 txn := db.NewTransaction(true) for i := 0; i < n; { if err := txn.SetEntry(NewEntry(data(i), data(i))); err != nil { require.NoError(t, txn.Commit()) txn = db.NewTransaction(true) } else { i++ } } require.NoError(t, txn.Commit()) txn = db.NewTransaction(true) for i := 0; i < n; { if err := txn.Delete(data(i)); err != nil { require.NoError(t, txn.Commit()) txn = db.NewTransaction(true) } else { i++ } } require.NoError(t, txn.Commit()) }) } func TestForceCompactL0(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 15 << 20 opts.managedTxns = true db, err := Open(opts) require.NoError(t, err) data := func(i int) []byte { return []byte(fmt.Sprintf("%b", i)) } n := 80 m := 45 // Increasing would cause ErrTxnTooBig sz := 32 << 10 v := make([]byte, sz) for i := 0; i < n; i += 2 { version := uint64(i) txn := db.NewTransactionAt(version, true) for j := 0; j < m; j++ { require.NoError(t, txn.SetEntry(NewEntry(data(j), v))) } 
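// This is a managed-mode DB (opts.managedTxns is set), so the caller assigns // timestamps explicitly: the transaction was opened with NewTransactionAt and // must be committed with CommitAt rather than Commit.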
require.NoError(t, txn.CommitAt(version+1, nil)) } db.Close() opts.managedTxns = true db, err = Open(opts) require.NoError(t, err) require.Equal(t, len(db.lc.levels[0].tables), 0) require.NoError(t, db.Close()) } func dirSize(path string) (int64, error) { var size int64 err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error { if err != nil { if os.IsNotExist(err) { return nil } return err } if !info.IsDir() { size += info.Size() } return err }) return (size >> 20), err } // BenchmarkDbGrowth ensures DB does not grow with repeated adds and deletes. // // New keys are created with each for-loop iteration. During each // iteration, the previous for-loop iteration's keys are deleted. // // To reproduce the continuous growth problem due to `badgerMove` keys, // update `value.go` `discardEntry` line 1628 to return false // // Also with PR #1303, the deleted keys are properly cleaned up, which // further reduces disk size. func BenchmarkDbGrowth(b *testing.B) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(b, err) defer removeDir(dir) start := 0 lastStart := 0 numKeys := 2000 valueSize := 1024 value := make([]byte, valueSize) discardRatio := 0.001 maxWrites := 200 opts := getTestOptions(dir) opts.ValueLogFileSize = 64 << 15 opts.MaxTableSize = 4 << 15 opts.LevelOneSize = 16 << 15 opts.NumVersionsToKeep = 1 opts.NumLevelZeroTables = 1 opts.NumLevelZeroTablesStall = 2 opts.KeepL0InMemory = false // enable L0 compaction db, err := Open(opts) require.NoError(b, err) for numWrites := 0; numWrites < maxWrites; numWrites++ { txn := db.NewTransaction(true) if start > 0 { for i := lastStart; i < start; i++ { key := make([]byte, 8) binary.BigEndian.PutUint64(key[:], uint64(i)) err := txn.Delete(key) if err == ErrTxnTooBig { require.NoError(b, txn.Commit()) txn = db.NewTransaction(true) } else { require.NoError(b, err) } } } for i := start; i < numKeys+start; i++ { key := make([]byte, 8) binary.BigEndian.PutUint64(key[:], uint64(i)) err := txn.SetEntry(NewEntry(key, value)) if err == ErrTxnTooBig { require.NoError(b, txn.Commit()) txn = db.NewTransaction(true) } else { require.NoError(b, err) } } require.NoError(b, txn.Commit()) require.NoError(b, db.Flatten(1)) for { err = db.RunValueLogGC(discardRatio) if err == ErrNoRewrite { break } else { require.NoError(b, err) } } size, err := dirSize(dir) require.NoError(b, err) fmt.Printf("Badger DB Size = %dMB\n", size) lastStart = start start += numKeys } db.Close() size, err := dirSize(dir) require.NoError(b, err) require.LessOrEqual(b, size, int64(16)) fmt.Printf("Badger DB Size = %dMB\n", size) } // Put a lot of data to move some data to disk. // WARNING: This test might take a while but it should pass!
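// The tiny MaxTableSize and LevelOneSize set by getTestOptions make the // memtable overflow quickly, so most of the data written below should end up // in SSTables on disk rather than staying in memory.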
func TestGetMore(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { data := func(i int) []byte { return []byte(fmt.Sprintf("%b", i)) } // n := 500000 n := 10000 m := 45 // Increasing would cause ErrTxnTooBig for i := 0; i < n; i += m { txn := db.NewTransaction(true) for j := i; j < i+m && j < n; j++ { require.NoError(t, txn.SetEntry(NewEntry(data(j), data(j)))) } require.NoError(t, txn.Commit()) } require.NoError(t, db.validate()) for i := 0; i < n; i++ { txn := db.NewTransaction(false) item, err := txn.Get(data(i)) if err != nil { t.Error(err) } require.EqualValues(t, string(data(i)), string(getItemValue(t, item))) txn.Discard() } // Overwrite for i := 0; i < n; i += m { txn := db.NewTransaction(true) for j := i; j < i+m && j < n; j++ { require.NoError(t, txn.SetEntry(NewEntry(data(j), // Use a long value that will certainly exceed value threshold. []byte(fmt.Sprintf("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz%9d", j))))) } require.NoError(t, txn.Commit()) } require.NoError(t, db.validate()) for i := 0; i < n; i++ { expectedValue := fmt.Sprintf("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz%9d", i) k := data(i) txn := db.NewTransaction(false) item, err := txn.Get(k) if err != nil { t.Error(err) } got := string(getItemValue(t, item)) if expectedValue != got { vs, err := db.get(y.KeyWithTs(k, math.MaxUint64)) require.NoError(t, err) fmt.Printf("wanted=%q Item: %s\n", k, item) fmt.Printf("on re-run, got version: %+v\n", vs) txn := db.NewTransaction(false) itr := txn.NewIterator(DefaultIteratorOptions) for itr.Seek(k); itr.Valid(); itr.Next() { item := itr.Item() fmt.Printf("item=%s\n", item) if !bytes.Equal(item.Key(), k) { break } } itr.Close() txn.Discard() } require.EqualValues(t, expectedValue, string(getItemValue(t, item)), "wanted=%q Item: %s\n", k, item) txn.Discard() } // "Delete" key. for i := 0; i < n; i += m { if (i % 10000) == 0 { fmt.Printf("Deleting i=%d\n", i) } txn := db.NewTransaction(true) for j := i; j < i+m && j < n; j++ { require.NoError(t, txn.Delete(data(j))) } require.NoError(t, txn.Commit()) } db.validate() for i := 0; i < n; i++ { if (i % 10000) == 0 { // Display some progress. Right now, it's not very fast with no caching. fmt.Printf("Testing i=%d\n", i) } k := data(i) txn := db.NewTransaction(false) _, err := txn.Get([]byte(k)) require.Equal(t, ErrKeyNotFound, err, "should not have found k: %q", k) txn.Discard() } }) } // Put a lot of data to move some data to disk. // WARNING: This test might take a while but it should pass! func TestExistsMore(t *testing.T) { test := func(t *testing.T, db *DB) { // n := 500000 n := 10000 m := 45 for i := 0; i < n; i += m { if (i % 1000) == 0 { t.Logf("Putting i=%d\n", i) } txn := db.NewTransaction(true) for j := i; j < i+m && j < n; j++ { require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("%09d", j)), []byte(fmt.Sprintf("%09d", j))))) } require.NoError(t, txn.Commit()) } db.validate() for i := 0; i < n; i++ { if (i % 1000) == 0 { fmt.Printf("Testing i=%d\n", i) } k := fmt.Sprintf("%09d", i) require.NoError(t, db.View(func(txn *Txn) error { _, err := txn.Get([]byte(k)) require.NoError(t, err) return nil })) } require.NoError(t, db.View(func(txn *Txn) error { _, err := txn.Get([]byte("non-exists")) require.Error(t, err) return nil })) // "Delete" key. 
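// A delete in Badger writes a tombstone entry; later reads of the key return // ErrKeyNotFound, even though older versions may linger in the LSM tree until // compaction removes them.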
for i := 0; i < n; i += m { if (i % 1000) == 0 { fmt.Printf("Deleting i=%d\n", i) } txn := db.NewTransaction(true) for j := i; j < i+m && j < n; j++ { require.NoError(t, txn.Delete([]byte(fmt.Sprintf("%09d", j)))) } require.NoError(t, txn.Commit()) } db.validate() for i := 0; i < n; i++ { if (i % 10000) == 0 { // Display some progress. Right now, it's not very fast with no caching. fmt.Printf("Testing i=%d\n", i) } k := fmt.Sprintf("%09d", i) require.NoError(t, db.View(func(txn *Txn) error { _, err := txn.Get([]byte(k)) require.Error(t, err) return nil })) } fmt.Println("Done and closing") } t.Run("disk mode", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { test(t, db) }) }) t.Run("InMemory mode", func(t *testing.T) { opt := DefaultOptions("").WithInMemory(true) db, err := Open(opt) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) } func TestIterate2Basic(t *testing.T) { test := func(t *testing.T, db *DB) { bkey := func(i int) []byte { return []byte(fmt.Sprintf("%09d", i)) } bval := func(i int) []byte { return []byte(fmt.Sprintf("%025d", i)) } // n := 500000 n := 10000 for i := 0; i < n; i++ { if (i % 1000) == 0 { t.Logf("Put i=%d\n", i) } txnSet(t, db, bkey(i), bval(i), byte(i%127)) } opt := IteratorOptions{} opt.PrefetchValues = true opt.PrefetchSize = 10 txn := db.NewTransaction(false) it := txn.NewIterator(opt) { var count int rewind := true t.Log("Starting first basic iteration") for it.Rewind(); it.Valid(); it.Next() { item := it.Item() key := item.Key() if rewind && count == 5000 { // Rewind would skip /head/ key, and it.Next() would skip 0. count = 1 it.Rewind() t.Log("Rewinding from 5000 to zero.") rewind = false continue } require.EqualValues(t, bkey(count), string(key)) val := getItemValue(t, item) require.EqualValues(t, bval(count), string(val)) require.Equal(t, byte(count%127), item.UserMeta()) count++ } require.EqualValues(t, n, count) } { t.Log("Starting second basic iteration") idx := 5030 for it.Seek(bkey(idx)); it.Valid(); it.Next() { item := it.Item() require.EqualValues(t, bkey(idx), string(item.Key())) require.EqualValues(t, bval(idx), string(getItemValue(t, item))) idx++ } } it.Close() } t.Run("disk mode", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { test(t, db) }) }) t.Run("InMemory mode", func(t *testing.T) { opt := DefaultOptions("").WithInMemory(true) db, err := Open(opt) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) } func TestLoad(t *testing.T) { testLoad := func(t *testing.T, opt Options) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt.Dir = dir opt.ValueDir = dir n := 10000 { kv, err := Open(opt) require.NoError(t, err) for i := 0; i < n; i++ { if (i % 10000) == 0 { fmt.Printf("Putting i=%d\n", i) } k := []byte(fmt.Sprintf("%09d", i)) txnSet(t, kv, k, k, 0x00) } kv.Close() } kv, err := Open(opt) require.NoError(t, err) require.Equal(t, uint64(10001), kv.orc.readTs()) for i := 0; i < n; i++ { if (i % 10000) == 0 { fmt.Printf("Testing i=%d\n", i) } k := fmt.Sprintf("%09d", i) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get([]byte(k)) require.NoError(t, err) require.EqualValues(t, k, string(getItemValue(t, item))) return nil })) } kv.Close() summary := kv.lc.getSummary() // Check that files are garbage collected. idMap := getIDMap(dir) for fileID := range idMap { // Check that name is in summary.filenames. 
require.True(t, summary.fileIDs[fileID], "%d", fileID) } require.EqualValues(t, len(idMap), len(summary.fileIDs)) var fileIDs []uint64 for k := range summary.fileIDs { // Map to array. fileIDs = append(fileIDs, k) } sort.Slice(fileIDs, func(i, j int) bool { return fileIDs[i] < fileIDs[j] }) fmt.Printf("FileIDs: %v\n", fileIDs) } t.Run("TestLoad Without Encryption/Compression", func(t *testing.T) { opt := getTestOptions("") opt.Compression = options.None testLoad(t, opt) }) t.Run("TestLoad With Encryption and no compression", func(t *testing.T) { key := make([]byte, 32) _, err := rand.Read(key) require.NoError(t, err) opt := getTestOptions("") opt.EncryptionKey = key opt.Compression = options.None testLoad(t, opt) }) t.Run("TestLoad With Encryption and compression", func(t *testing.T) { key := make([]byte, 32) _, err := rand.Read(key) require.NoError(t, err) opt := getTestOptions("") opt.EncryptionKey = key opt.Compression = options.ZSTD testLoad(t, opt) }) t.Run("TestLoad without Encryption and with compression", func(t *testing.T) { opt := getTestOptions("") opt.Compression = options.ZSTD testLoad(t, opt) }) } func TestIterateDeleted(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { txnSet(t, db, []byte("Key1"), []byte("Value1"), 0x00) txnSet(t, db, []byte("Key2"), []byte("Value2"), 0x00) iterOpt := DefaultIteratorOptions iterOpt.PrefetchValues = false txn := db.NewTransaction(false) idxIt := txn.NewIterator(iterOpt) defer idxIt.Close() count := 0 txn2 := db.NewTransaction(true) prefix := []byte("Key") for idxIt.Seek(prefix); idxIt.ValidForPrefix(prefix); idxIt.Next() { key := idxIt.Item().Key() count++ newKey := make([]byte, len(key)) copy(newKey, key) require.NoError(t, txn2.Delete(newKey)) } require.Equal(t, 2, count) require.NoError(t, txn2.Commit()) for _, prefetch := range [...]bool{true, false} { t.Run(fmt.Sprintf("Prefetch=%t", prefetch), func(t *testing.T) { txn := db.NewTransaction(false) iterOpt = DefaultIteratorOptions iterOpt.PrefetchValues = prefetch idxIt = txn.NewIterator(iterOpt) var estSize int64 var idxKeys []string for idxIt.Seek(prefix); idxIt.Valid(); idxIt.Next() { item := idxIt.Item() key := item.Key() estSize += item.EstimatedSize() if !bytes.HasPrefix(key, prefix) { break } idxKeys = append(idxKeys, string(key)) t.Logf("%+v\n", idxIt.Item()) } require.Equal(t, 0, len(idxKeys)) require.Equal(t, int64(0), estSize) }) } }) } func TestIterateParallel(t *testing.T) { key := func(account int) []byte { var b [4]byte binary.BigEndian.PutUint32(b[:], uint32(account)) return append([]byte("account-"), b[:]...) } N := 100000 iterate := func(txn *Txn, wg *sync.WaitGroup) { defer wg.Done() itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() var count int for itr.Rewind(); itr.Valid(); itr.Next() { count++ item := itr.Item() require.Equal(t, "account-", string(item.Key()[0:8])) err := item.Value(func(val []byte) error { require.Equal(t, "1000", string(val)) return nil }) require.NoError(t, err) } require.Equal(t, N, count) itr.Close() // Double close. } opt := DefaultOptions("") runBadgerTest(t, &opt, func(t *testing.T, db *DB) { var wg sync.WaitGroup var txns []*Txn for i := 0; i < N; i++ { wg.Add(1) txn := db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(key(i), []byte("1000")))) txns = append(txns, txn) } for _, txn := range txns { txn.CommitWith(func(err error) { y.Check(err) wg.Done() }) } wg.Wait() // Check that a RW txn can run multiple iterators. 
txn := db.NewTransaction(true) itr := txn.NewIterator(DefaultIteratorOptions) require.NotPanics(t, func() { // Now that multiple iterators are supported in read-write // transactions, make sure this does not panic anymore. Then just // close the iterator. txn.NewIterator(DefaultIteratorOptions).Close() }) // The transaction should still panic since there is still one pending // iterator that is open. require.Panics(t, txn.Discard) itr.Close() txn.Discard() // (Regression) Make sure that creating multiple concurrent iterators // within a read only transaction continues to work. t.Run("multiple read-only iterators", func(t *testing.T) { // Run multiple iterators for a RO txn. txn = db.NewTransaction(false) defer txn.Discard() wg.Add(3) go iterate(txn, &wg) go iterate(txn, &wg) go iterate(txn, &wg) wg.Wait() }) // Make sure that when we create multiple concurrent iterators within a // read-write transaction that it actually iterates successfully. t.Run("multiple read-write iterators", func(t *testing.T) { // Run multiple iterators for a RO txn. txn = db.NewTransaction(true) defer txn.Discard() wg.Add(3) go iterate(txn, &wg) go iterate(txn, &wg) go iterate(txn, &wg) wg.Wait() }) }) } func TestDeleteWithoutSyncWrite(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) kv, err := Open(DefaultOptions(dir)) if err != nil { t.Error(err) t.Fail() } key := []byte("k1") // Set a value with size > value threshold so that its written to value log. txnSet(t, kv, key, []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789FOOBARZOGZOG"), 0x00) txnDelete(t, kv, key) kv.Close() // Reopen KV kv, err = Open(DefaultOptions(dir)) if err != nil { t.Error(err) t.Fail() } defer kv.Close() require.NoError(t, kv.View(func(txn *Txn) error { _, err := txn.Get(key) require.Equal(t, ErrKeyNotFound, err) return nil })) } func TestPidFile(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // Reopen database _, err := Open(getTestOptions(db.opt.Dir)) require.Error(t, err) require.Contains(t, err.Error(), "Another process is using this Badger database") }) } func TestInvalidKey(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { err := db.Update(func(txn *Txn) error { err := txn.SetEntry(NewEntry([]byte("!badger!head"), nil)) require.Equal(t, ErrInvalidKey, err) err = txn.SetEntry(NewEntry([]byte("!badger!"), nil)) require.Equal(t, ErrInvalidKey, err) err = txn.SetEntry(NewEntry([]byte("!badger"), []byte("BadgerDB"))) require.NoError(t, err) return err }) require.NoError(t, err) require.NoError(t, db.View(func(txn *Txn) error { item, err := txn.Get([]byte("!badger")) if err != nil { return err } require.NoError(t, item.Value(func(val []byte) error { require.Equal(t, []byte("BadgerDB"), val) return nil })) return nil })) }) } func TestIteratorPrefetchSize(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { bkey := func(i int) []byte { return []byte(fmt.Sprintf("%09d", i)) } bval := func(i int) []byte { return []byte(fmt.Sprintf("%025d", i)) } n := 100 for i := 0; i < n; i++ { // if (i % 10) == 0 { // t.Logf("Put i=%d\n", i) // } txnSet(t, db, bkey(i), bval(i), byte(i%127)) } getIteratorCount := func(prefetchSize int) int { opt := IteratorOptions{} opt.PrefetchValues = true opt.PrefetchSize = prefetchSize var count int txn := db.NewTransaction(false) it := txn.NewIterator(opt) { t.Log("Starting first basic iteration") for it.Rewind(); it.Valid(); it.Next() { count++ } require.EqualValues(t, n, count) } return count } var sizes = 
[]int{-10, 0, 1, 10} for _, size := range sizes { c := getIteratorCount(size) require.Equal(t, 100, c) } }) } func TestSetIfAbsentAsync(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) kv, _ := Open(getTestOptions(dir)) bkey := func(i int) []byte { return []byte(fmt.Sprintf("%09d", i)) } f := func(err error) {} n := 1000 for i := 0; i < n; i++ { // if (i % 10) == 0 { // t.Logf("Put i=%d\n", i) // } txn := kv.NewTransaction(true) _, err = txn.Get(bkey(i)) require.Equal(t, ErrKeyNotFound, err) require.NoError(t, txn.SetEntry(NewEntry(bkey(i), nil).WithMeta(byte(i%127)))) txn.CommitWith(f) } require.NoError(t, kv.Close()) kv, err = Open(getTestOptions(dir)) require.NoError(t, err) opt := DefaultIteratorOptions txn := kv.NewTransaction(false) var count int it := txn.NewIterator(opt) { t.Log("Starting first basic iteration") for it.Rewind(); it.Valid(); it.Next() { count++ } require.EqualValues(t, n, count) } require.Equal(t, n, count) require.NoError(t, kv.Close()) } func TestGetSetRace(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { data := make([]byte, 4096) _, err := rand.Read(data) require.NoError(t, err) var ( numOp = 100 wg sync.WaitGroup keyCh = make(chan string) ) // writer wg.Add(1) go func() { defer func() { wg.Done() close(keyCh) }() for i := 0; i < numOp; i++ { key := fmt.Sprintf("%d", i) txnSet(t, db, []byte(key), data, 0x00) keyCh <- key } }() // reader wg.Add(1) go func() { defer wg.Done() for key := range keyCh { require.NoError(t, db.View(func(txn *Txn) error { item, err := txn.Get([]byte(key)) require.NoError(t, err) err = item.Value(nil) require.NoError(t, err) return nil })) } }() wg.Wait() }) } func TestDiscardVersionsBelow(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // Write 4 versions of the same key for i := 0; i < 4; i++ { err := db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("answer"), []byte(fmt.Sprintf("%d", i)))) }) require.NoError(t, err) } opts := DefaultIteratorOptions opts.AllVersions = true opts.PrefetchValues = false // Verify that there are 4 versions, and record 3rd version (2nd from top in iteration) db.View(func(txn *Txn) error { it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { count++ item := it.Item() require.Equal(t, []byte("answer"), item.Key()) if item.DiscardEarlierVersions() { break } } require.Equal(t, 4, count) return nil }) // Set new version and discard older ones. err := db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("answer"), []byte("5")).WithDiscard()) }) require.NoError(t, err) // Verify that there are only 2 versions left, and versions // below ts have been deleted. 
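// The iteration below counts only one version: the newest version carries the // discard marker (set via WithDiscard above), so the loop breaks as soon as it // sees it.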
db.View(func(txn *Txn) error { it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { count++ item := it.Item() require.Equal(t, []byte("answer"), item.Key()) if item.DiscardEarlierVersions() { break } } require.Equal(t, 1, count) return nil }) }) } func TestExpiry(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // Write two keys, one with a TTL err := db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("answer1"), []byte("42"))) }) require.NoError(t, err) err = db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("answer2"), []byte("43")).WithTTL(1 * time.Second)) }) require.NoError(t, err) time.Sleep(2 * time.Second) // Verify that only unexpired key is found during iteration err = db.View(func(txn *Txn) error { _, err := txn.Get([]byte("answer1")) require.NoError(t, err) _, err = txn.Get([]byte("answer2")) require.Equal(t, ErrKeyNotFound, err) return nil }) require.NoError(t, err) // Verify that only one key is found during iteration opts := DefaultIteratorOptions opts.PrefetchValues = false err = db.View(func(txn *Txn) error { it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { count++ item := it.Item() require.Equal(t, []byte("answer1"), item.Key()) } require.Equal(t, 1, count) return nil }) require.NoError(t, err) }) } func TestExpiryImproperDBClose(t *testing.T) { testReplay := func(opt Options) { // L0 compaction doesn't affect the test in any way. It is set to allow // graceful shutdown of db0. db0, err := Open(opt.WithCompactL0OnClose(false)) require.NoError(t, err) dur := 1 * time.Hour expiryTime := uint64(time.Now().Add(dur).Unix()) err = db0.Update(func(txn *Txn) error { err = txn.SetEntry(NewEntry([]byte("test_key"), []byte("test_value")).WithTTL(dur)) require.NoError(t, err) return nil }) require.NoError(t, err) // Simulate a crash by not closing db0, but releasing the locks. 
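// Releasing the directory locks lets the Open below succeed, while the value // log and manifest remain in whatever state the simulated crash left them.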
if db0.dirLockGuard != nil { require.NoError(t, db0.dirLockGuard.release()) db0.dirLockGuard = nil } if db0.valueDirGuard != nil { require.NoError(t, db0.valueDirGuard.release()) db0.valueDirGuard = nil } require.NoError(t, db0.Close()) db1, err := Open(opt) require.NoError(t, err) err = db1.View(func(txn *Txn) error { itm, err := txn.Get([]byte("test_key")) require.NoError(t, err) require.True(t, expiryTime <= itm.ExpiresAt() && itm.ExpiresAt() <= uint64(time.Now().Add(dur).Unix()), "expiry time of entry is invalid") return nil }) require.NoError(t, err) require.NoError(t, db1.Close()) } t.Run("Test plain text", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) testReplay(opt) }) t.Run("Test encryption", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) key := make([]byte, 32) _, err = rand.Read(key) require.NoError(t, err) opt.EncryptionKey = key testReplay(opt) }) } func randBytes(n int) []byte { recv := make([]byte, n) in, err := rand.Read(recv) if err != nil { panic(err) } return recv[:in] } var benchmarkData = []struct { key, value []byte success bool // represent if KV should be inserted successfully or not }{ {randBytes(100), nil, true}, {randBytes(1000), []byte("foo"), true}, {[]byte("foo"), randBytes(1000), true}, {[]byte(""), randBytes(1000), false}, {nil, randBytes(1000000), false}, {randBytes(100000), nil, false}, {randBytes(1000000), nil, false}, } func TestLargeKeys(t *testing.T) { test := func(t *testing.T, opt Options) { db, err := Open(opt) require.NoError(t, err) for i := 0; i < 1000; i++ { tx := db.NewTransaction(true) for _, kv := range benchmarkData { k := make([]byte, len(kv.key)) copy(k, kv.key) v := make([]byte, len(kv.value)) copy(v, kv.value) if err := tx.SetEntry(NewEntry(k, v)); err != nil { // check is success should be true if kv.success { t.Fatalf("failed with: %s", err) } } else if !kv.success { t.Fatal("insertion should fail") } } if err := tx.Commit(); err != nil { t.Fatalf("#%d: batchSet err: %v", i, err) } } require.NoError(t, db.Close()) } t.Run("disk mode", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := DefaultOptions(dir).WithValueLogFileSize(1024 * 1024 * 1024) test(t, opt) }) t.Run("InMemory mode", func(t *testing.T) { opt := DefaultOptions("").WithValueLogFileSize(1024 * 1024 * 1024) opt.InMemory = true test(t, opt) }) } func TestCreateDirs(t *testing.T) { dir, err := ioutil.TempDir("", "parent") require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(filepath.Join(dir, "badger"))) require.NoError(t, err) require.NoError(t, db.Close()) _, err = os.Stat(dir) require.NoError(t, err) } func TestGetSetDeadlock(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") fmt.Println(dir) require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(dir).WithValueLogFileSize(1 << 20)) require.NoError(t, err) defer db.Close() val := make([]byte, 1<<19) key := []byte("key1") require.NoError(t, db.Update(func(txn *Txn) error { rand.Read(val) require.NoError(t, txn.SetEntry(NewEntry(key, val))) return nil })) timeout, done := time.After(10*time.Second), make(chan bool) go func() { db.Update(func(txn *Txn) error { item, err := txn.Get(key) require.NoError(t, err) err = item.Value(nil) // This take a RLock on file require.NoError(t, err) rand.Read(val) require.NoError(t, 
txn.SetEntry(NewEntry(key, val))) require.NoError(t, txn.SetEntry(NewEntry([]byte("key2"), val))) return nil }) done <- true }() select { case <-timeout: t.Fatal("db.Update did not finish within 10s, assuming deadlock.") case <-done: t.Log("db.Update finished.") } } func TestWriteDeadlock(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(dir).WithValueLogFileSize(10 << 20)) require.NoError(t, err) defer db.Close() print := func(count *int) { *count++ if *count%100 == 0 { fmt.Printf("%05d\r", *count) } } var count int val := make([]byte, 10000) require.NoError(t, db.Update(func(txn *Txn) error { for i := 0; i < 1500; i++ { key := fmt.Sprintf("%d", i) rand.Read(val) require.NoError(t, txn.SetEntry(NewEntry([]byte(key), val))) print(&count) } return nil })) count = 0 fmt.Println("\nWrites done. Iteration and updates starting...") err = db.Update(func(txn *Txn) error { opt := DefaultIteratorOptions opt.PrefetchValues = false it := txn.NewIterator(opt) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { item := it.Item() // Using Value() would cause deadlock. // item.Value() out, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, len(val), len(out)) key := y.Copy(item.Key()) rand.Read(val) require.NoError(t, txn.SetEntry(NewEntry(key, val))) print(&count) } return nil }) require.NoError(t, err) } func TestSequence(t *testing.T) { key0 := []byte("seq0") key1 := []byte("seq1") runBadgerTest(t, nil, func(t *testing.T, db *DB) { seq0, err := db.GetSequence(key0, 10) require.NoError(t, err) seq1, err := db.GetSequence(key1, 100) require.NoError(t, err) for i := uint64(0); i < uint64(105); i++ { num, err := seq0.Next() require.NoError(t, err) require.Equal(t, i, num) num, err = seq1.Next() require.NoError(t, err) require.Equal(t, i, num) } err = db.View(func(txn *Txn) error { item, err := txn.Get(key0) if err != nil { return err } var num0 uint64 if err := item.Value(func(val []byte) error { num0 = binary.BigEndian.Uint64(val) return nil }); err != nil { return err } require.Equal(t, uint64(110), num0) item, err = txn.Get(key1) if err != nil { return err } var num1 uint64 if err := item.Value(func(val []byte) error { num1 = binary.BigEndian.Uint64(val) return nil }); err != nil { return err } require.Equal(t, uint64(200), num1) return nil }) require.NoError(t, err) }) } func TestSequence_Release(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // get sequence, use once and release key := []byte("key") seq, err := db.GetSequence(key, 1000) require.NoError(t, err) num, err := seq.Next() require.NoError(t, err) require.Equal(t, uint64(0), num) require.NoError(t, seq.Release()) // we used up 0 and 1 should be stored now err = db.View(func(txn *Txn) error { item, err := txn.Get(key) if err != nil { return err } val, err := item.ValueCopy(nil) if err != nil { return err } require.Equal(t, num+1, binary.BigEndian.Uint64(val)) return nil }) require.NoError(t, err) // using it again will lease 1+1000 num, err = seq.Next() require.NoError(t, err) require.Equal(t, uint64(1), num) err = db.View(func(txn *Txn) error { item, err := txn.Get(key) if err != nil { return err } val, err := item.ValueCopy(nil) if err != nil { return err } require.Equal(t, uint64(1001), binary.BigEndian.Uint64(val)) return nil }) require.NoError(t, err) }) } func TestTestSequence2(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { key := []byte("key") seq1, err := db.GetSequence(key, 2) 
require.NoError(t, err) seq2, err := db.GetSequence(key, 2) require.NoError(t, err) num, err := seq2.Next() require.NoError(t, err) require.Equal(t, uint64(2), num) require.NoError(t, seq2.Release()) require.NoError(t, seq1.Release()) seq3, err := db.GetSequence(key, 2) require.NoError(t, err) for i := 0; i < 5; i++ { num2, err := seq3.Next() require.NoError(t, err) require.Equal(t, uint64(i)+3, num2) } require.NoError(t, seq3.Release()) }) } func TestReadOnly(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) // Create the DB db, err := Open(opts) require.NoError(t, err) for i := 0; i < 10000; i++ { txnSet(t, db, []byte(fmt.Sprintf("key%d", i)), []byte(fmt.Sprintf("value%d", i)), 0x00) } // Attempt a read-only open while it's open read-write. opts.ReadOnly = true _, err = Open(opts) require.Error(t, err) if err == ErrWindowsNotSupported { require.NoError(t, db.Close()) return } require.Contains(t, err.Error(), "Another process is using this Badger database") db.Close() // Open one read-only opts.ReadOnly = true kv1, err := Open(opts) require.NoError(t, err) defer kv1.Close() // Open another read-only kv2, err := Open(opts) require.NoError(t, err) defer kv2.Close() // Attempt a read-write open while it's open for read-only opts.ReadOnly = false _, err = Open(opts) require.Error(t, err) require.Contains(t, err.Error(), "Another process is using this Badger database") // Get a thing from the DB txn1 := kv1.NewTransaction(true) v1, err := txn1.Get([]byte("key1")) require.NoError(t, err) b1, err := v1.ValueCopy(nil) require.NoError(t, err) require.Equal(t, b1, []byte("value1")) err = txn1.Commit() require.NoError(t, err) // Get a thing from the DB via the other connection txn2 := kv2.NewTransaction(true) v2, err := txn2.Get([]byte("key2000")) require.NoError(t, err) b2, err := v2.ValueCopy(nil) require.NoError(t, err) require.Equal(t, b2, []byte("value2000")) err = txn2.Commit() require.NoError(t, err) // Attempt to set a value on a read-only connection txn := kv1.NewTransaction(true) err = txn.SetEntry(NewEntry([]byte("key"), []byte("value"))) require.Error(t, err) require.Contains(t, err.Error(), "No sets or deletes are allowed in a read-only transaction") err = txn.Commit() require.NoError(t, err) } func TestLSMOnly(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := LSMOnlyOptions(dir) dopts := DefaultOptions(dir) require.NotEqual(t, dopts.ValueThreshold, opts.ValueThreshold) dopts.ValueThreshold = 1 << 21 _, err = Open(dopts) require.Contains(t, err.Error(), "Invalid ValueThreshold") // Also test for error, when ValueThresholdSize is greater than maxBatchSize. dopts.ValueThreshold = LSMOnlyOptions(dir).ValueThreshold // maxBatchSize is calculated from MaxTableSize. dopts.MaxTableSize = int64(LSMOnlyOptions(dir).ValueThreshold) _, err = Open(dopts) require.Error(t, err, "db creation should have been failed") require.Contains(t, err.Error(), "Valuethreshold greater than max batch size") opts.ValueLogMaxEntries = 100 db, err := Open(opts) require.NoError(t, err) value := make([]byte, 128) _, err = rand.Read(value) for i := 0; i < 500; i++ { require.NoError(t, err) txnSet(t, db, []byte(fmt.Sprintf("key%d", i)), value, 0x00) } require.NoError(t, db.Close()) // Close to force compactions, so Value log GC would run. 
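// RunValueLogGC(0.2) below rewrites a value log file only if roughly 20% or // more of its data is discardable; otherwise it returns ErrNoRewrite.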
db, err = Open(opts) require.NoError(t, err) defer db.Close() require.NoError(t, db.RunValueLogGC(0.2)) } // This test function is doing some intricate sorcery. func TestMinReadTs(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { for i := 0; i < 10; i++ { require.NoError(t, db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("x"), []byte("y"))) })) } time.Sleep(time.Millisecond) readTxn0 := db.NewTransaction(false) require.Equal(t, uint64(10), readTxn0.readTs) min := db.orc.readMark.DoneUntil() require.Equal(t, uint64(9), min) readTxn := db.NewTransaction(false) for i := 0; i < 10; i++ { require.NoError(t, db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("x"), []byte("y"))) })) } require.Equal(t, uint64(20), db.orc.readTs()) time.Sleep(time.Millisecond) require.Equal(t, min, db.orc.readMark.DoneUntil()) readTxn0.Discard() readTxn.Discard() time.Sleep(time.Millisecond) require.Equal(t, uint64(19), db.orc.readMark.DoneUntil()) db.orc.readMark.Done(uint64(20)) // Because we called readTs. for i := 0; i < 10; i++ { db.View(func(txn *Txn) error { return nil }) } time.Sleep(time.Millisecond) require.Equal(t, uint64(20), db.orc.readMark.DoneUntil()) }) } func TestGoroutineLeak(t *testing.T) { test := func(t *testing.T, opt *Options) { time.Sleep(1 * time.Second) before := runtime.NumGoroutine() t.Logf("Num go: %d", before) for i := 0; i < 12; i++ { runBadgerTest(t, nil, func(t *testing.T, db *DB) { updated := false ctx, cancel := context.WithCancel(context.Background()) var wg sync.WaitGroup wg.Add(1) go func() { err := db.Subscribe(ctx, func(kvs *pb.KVList) error { require.Equal(t, []byte("value"), kvs.Kv[0].GetValue()) updated = true wg.Done() return nil }, []byte("key")) if err != nil { require.Equal(t, err.Error(), context.Canceled.Error()) } }() // Wait for the go routine to be scheduled. 
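// db.Subscribe blocks until ctx is cancelled, invoking the callback for every // committed write that matches the registered prefix, so it has to run in its // own goroutine.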
time.Sleep(time.Second) err := db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("key"), []byte("value"))) }) require.NoError(t, err) wg.Wait() cancel() require.Equal(t, true, updated) }) } time.Sleep(2 * time.Second) require.Equal(t, before, runtime.NumGoroutine()) } t.Run("disk mode", func(t *testing.T) { test(t, nil) }) t.Run("InMemory mode", func(t *testing.T) { opt := DefaultOptions("").WithInMemory(true) test(t, &opt) }) } func ExampleOpen() { dir, err := ioutil.TempDir("", "badger-test") if err != nil { panic(err) } defer removeDir(dir) db, err := Open(DefaultOptions(dir)) if err != nil { panic(err) } defer db.Close() err = db.View(func(txn *Txn) error { _, err := txn.Get([]byte("key")) // We expect ErrKeyNotFound fmt.Println(err) return nil }) if err != nil { panic(err) } txn := db.NewTransaction(true) // Read-write txn err = txn.SetEntry(NewEntry([]byte("key"), []byte("value"))) if err != nil { panic(err) } err = txn.Commit() if err != nil { panic(err) } err = db.View(func(txn *Txn) error { item, err := txn.Get([]byte("key")) if err != nil { return err } val, err := item.ValueCopy(nil) if err != nil { return err } fmt.Printf("%s\n", string(val)) return nil }) if err != nil { panic(err) } // Output: // Key not found // value } func ExampleTxn_NewIterator() { dir, err := ioutil.TempDir("", "badger-test") if err != nil { panic(err) } defer removeDir(dir) db, err := Open(DefaultOptions(dir)) if err != nil { panic(err) } defer db.Close() bkey := func(i int) []byte { return []byte(fmt.Sprintf("%09d", i)) } bval := func(i int) []byte { return []byte(fmt.Sprintf("%025d", i)) } txn := db.NewTransaction(true) // Fill in 1000 items n := 1000 for i := 0; i < n; i++ { err := txn.SetEntry(NewEntry(bkey(i), bval(i))) if err != nil { panic(err) } } err = txn.Commit() if err != nil { panic(err) } opt := DefaultIteratorOptions opt.PrefetchSize = 10 // Iterate over 1000 items var count int err = db.View(func(txn *Txn) error { it := txn.NewIterator(opt) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { count++ } return nil }) if err != nil { panic(err) } fmt.Printf("Counted %d elements", count) // Output: // Counted 1000 elements } func TestSyncForRace(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(dir).WithSyncWrites(false)) require.NoError(t, err) defer db.Close() closeChan := make(chan struct{}) doneChan := make(chan struct{}) go func() { ticker := time.NewTicker(100 * time.Microsecond) for { select { case <-ticker.C: if err := db.Sync(); err != nil { require.NoError(t, err) } db.opt.Debugf("Sync Iteration completed") case <-closeChan: close(doneChan) return } } }() sz := 128 << 10 // 5 entries per value log file. v := make([]byte, sz) rand.Read(v[:rand.Intn(sz)]) txn := db.NewTransaction(true) for i := 0; i < 10000; i++ { require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v))) if i%3 == 0 { require.NoError(t, txn.Commit()) txn = db.NewTransaction(true) } if i%100 == 0 { db.opt.Debugf("next 100 entries added to DB") } } require.NoError(t, txn.Commit()) close(closeChan) <-doneChan } // Earlier, if head is not pointing to latest Vlog file, then at replay badger used to crash with // index out of range panic. After fix in this commit it should not. 
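// The test below forces that situation: with ValueLogMaxEntries set to 1 every // write rotates the vlog, and vhead is then pointed at an early file before the // DB is reopened and replayed.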
func TestNoCrash(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err, "cannot create badger dir") defer removeDir(dir) ops := getTestOptions(dir) ops.ValueLogMaxEntries = 1 ops.ValueThreshold = 32 db, err := Open(ops) require.NoError(t, err, "unable to open db") // entering 100 entries will generate 100 vlog files for i := 0; i < 100; i++ { err := db.Update(func(txn *Txn) error { entry := NewEntry([]byte(fmt.Sprintf("key-%d", i)), []byte(fmt.Sprintf("val-%d", i))) return txn.SetEntry(entry) }) require.NoError(t, err, "update to db failed") } db.Lock() // Make head point to the second file. We cannot make it point to the first // vlog file because we cannot push a zero head pointer. db.vhead = valuePointer{1, 0, 0} db.Unlock() db.Close() // reduce size of SSTable to flush early ops.MaxTableSize = 1 << 10 db, err = Open(ops) require.Nil(t, err, "error while opening db") require.NoError(t, db.Close()) } func TestForceFlushMemtable(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err, "temp dir for badger could not be created") ops := getTestOptions(dir) ops.ValueLogMaxEntries = 1 ops.LogRotatesToFlush = 1 db, err := Open(ops) require.NoError(t, err, "error while opening db") defer db.Close() for i := 0; i < 3; i++ { err = db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key-%d", i)), []byte(fmt.Sprintf("value-%d", i)))) }) require.NoError(t, err, "unable to set key and value") } time.Sleep(1 * time.Second) // We want to make sure that the memtable is flushed to disk. While flushing the memtable to // disk, the latest head is also stored in it. Hence we will try to read the head from disk. // To ensure this, we truncate all memtables. db.Lock() db.mt.DecrRef() for _, mt := range db.imm { mt.DecrRef() } db.imm = db.imm[:0] db.mt = skl.NewSkiplist(arenaSize(db.opt)) // Set it up for future writes. db.Unlock() // get latest value of value log head headKey := y.KeyWithTs(head, math.MaxUint64) vs, err := db.get(headKey) require.NoError(t, err) var vptr valuePointer vptr.Decode(vs.Value) // Since we are inserting 3 entries and ValueLogMaxEntries is 1, there will be 3 rotations. // For the 1st and 2nd rotations, the head flushed with the memtable will have fid 0; the last // time it will be 1. require.True(t, vptr.Fid == 1, fmt.Sprintf("expected fid: %d, actual fid: %d", 1, vptr.Fid)) } func TestVerifyChecksum(t *testing.T) { testVerfiyCheckSum := func(t *testing.T, opt Options) { path, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer os.Remove(path) opt.ValueDir = path opt.Dir = path // use stream write for writing.
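// StreamWriter bulk-loads sorted key-value streams into a fresh DB using the // Prepare/Write/Flush sequence shown below: Prepare sets the DB up for the bulk // write, Write ingests the KV batches, and Flush finalizes and syncs the result.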
runBadgerTest(t, &opt, func(t *testing.T, db *DB) { value := make([]byte, 32) y.Check2(rand.Read(value)) l := &pb.KVList{} st := 0 for i := 0; i < 1000; i++ { key := make([]byte, 8) binary.BigEndian.PutUint64(key, uint64(i)) l.Kv = append(l.Kv, &pb.KV{ Key: key, Value: value, StreamId: uint32(st), Version: 1, }) if i%100 == 0 { st++ } } sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") require.NoError(t, sw.Write(l), "sw.Write() failed") require.NoError(t, sw.Flush(), "sw.Flush() failed") require.NoError(t, db.VerifyChecksum(), "checksum verification failed for DB") }) } t.Run("Testing Verify Checksum without encryption", func(t *testing.T) { testVerfiyCheckSum(t, getTestOptions("")) }) t.Run("Testing Verify Checksum with Encryption", func(t *testing.T) { key := make([]byte, 32) _, err := rand.Read(key) require.NoError(t, err) opt := getTestOptions("") opt.EncryptionKey = key testVerfiyCheckSum(t, opt) }) } func TestMain(m *testing.M) { flag.Parse() os.Exit(m.Run()) } func removeDir(dir string) { if err := os.RemoveAll(dir); err != nil { panic(err) } } func TestWriteInemory(t *testing.T) { opt := DefaultOptions("").WithInMemory(true) db, err := Open(opt) require.NoError(t, err) defer func() { require.NoError(t, db.Close()) }() for i := 0; i < 100; i++ { txnSet(t, db, []byte(fmt.Sprintf("key%d", i)), []byte(fmt.Sprintf("val%d", i)), 0x00) } err = db.View(func(txn *Txn) error { for j := 0; j < 100; j++ { item, err := txn.Get([]byte(fmt.Sprintf("key%d", j))) require.NoError(t, err) expected := []byte(fmt.Sprintf("val%d", j)) item.Value(func(val []byte) error { require.Equal(t, expected, val, "Invalid value for key %q. expected: %q, actual: %q", item.Key(), expected, val) return nil }) } return nil }) require.NoError(t, err) } badger-2.2007.2/dir_unix.go000066400000000000000000000070161372173116500153520ustar00rootroot00000000000000// +build !windows /* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "fmt" "io/ioutil" "os" "path/filepath" "github.com/pkg/errors" "golang.org/x/sys/unix" ) // directoryLockGuard holds a lock on a directory and a pid file inside. The pid file isn't part // of the locking mechanism, it's just advisory. type directoryLockGuard struct { // File handle on the directory, which we've flocked. f *os.File // The absolute path to our pid file. path string // Was this a shared lock for a read-only database? readOnly bool } // acquireDirectoryLock gets a lock on the directory (using flock). If // this is not read-only, it will also write our pid to // dirPath/pidFileName for convenience. func acquireDirectoryLock(dirPath string, pidFileName string, readOnly bool) ( *directoryLockGuard, error) { // Convert to absolute path so that Release still works even if we do an unbalanced // chdir in the meantime. 
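// The flock below uses LOCK_EX for read-write access (a single owner) or // LOCK_SH for read-only databases (several readers may share the lock), plus // LOCK_NB so a conflicting lock fails immediately instead of blocking.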
absPidFilePath, err := filepath.Abs(filepath.Join(dirPath, pidFileName)) if err != nil { return nil, errors.Wrap(err, "cannot get absolute path for pid lock file") } f, err := os.Open(dirPath) if err != nil { return nil, errors.Wrapf(err, "cannot open directory %q", dirPath) } opts := unix.LOCK_EX | unix.LOCK_NB if readOnly { opts = unix.LOCK_SH | unix.LOCK_NB } err = unix.Flock(int(f.Fd()), opts) if err != nil { f.Close() return nil, errors.Wrapf(err, "Cannot acquire directory lock on %q. Another process is using this Badger database.", dirPath) } if !readOnly { // Yes, we happily overwrite a pre-existing pid file. We're the // only read-write badger process using this directory. err = ioutil.WriteFile(absPidFilePath, []byte(fmt.Sprintf("%d\n", os.Getpid())), 0666) if err != nil { f.Close() return nil, errors.Wrapf(err, "Cannot write pid file %q", absPidFilePath) } } return &directoryLockGuard{f, absPidFilePath, readOnly}, nil } // Release deletes the pid file and releases our lock on the directory. func (guard *directoryLockGuard) release() error { var err error if !guard.readOnly { // It's important that we remove the pid file first. err = os.Remove(guard.path) } if closeErr := guard.f.Close(); err == nil { err = closeErr } guard.path = "" guard.f = nil return err } // openDir opens a directory for syncing. func openDir(path string) (*os.File, error) { return os.Open(path) } // When you create or delete a file, you have to ensure the directory entry for the file is synced // in order to guarantee the file is visible (if the system crashes). (See the man page for fsync, // or see https://github.com/coreos/etcd/issues/6368 for an example.) func syncDir(dir string) error { f, err := openDir(dir) if err != nil { return errors.Wrapf(err, "While opening directory: %s.", dir) } err = f.Sync() closeErr := f.Close() if err != nil { return errors.Wrapf(err, "While syncing directory: %s.", dir) } return errors.Wrapf(closeErr, "While closing directory: %s.", dir) } badger-2.2007.2/dir_windows.go000066400000000000000000000075031372173116500160620ustar00rootroot00000000000000// +build windows /* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger // OpenDir opens a directory in windows with write access for syncing. import ( "os" "path/filepath" "syscall" "github.com/pkg/errors" ) // FILE_ATTRIBUTE_TEMPORARY - A file that is being used for temporary storage. // FILE_FLAG_DELETE_ON_CLOSE - The file is to be deleted immediately after all of its handles are // closed, which includes the specified handle and any other open or duplicated handles. 
// See: https://docs.microsoft.com/en-us/windows/desktop/FileIO/file-attribute-constants // NOTE: Added here to avoid importing golang.org/x/sys/windows const ( FILE_ATTRIBUTE_TEMPORARY = 0x00000100 FILE_FLAG_DELETE_ON_CLOSE = 0x04000000 ) func openDir(path string) (*os.File, error) { fd, err := openDirWin(path) if err != nil { return nil, err } return os.NewFile(uintptr(fd), path), nil } func openDirWin(path string) (fd syscall.Handle, err error) { if len(path) == 0 { return syscall.InvalidHandle, syscall.ERROR_FILE_NOT_FOUND } pathp, err := syscall.UTF16PtrFromString(path) if err != nil { return syscall.InvalidHandle, err } access := uint32(syscall.GENERIC_READ | syscall.GENERIC_WRITE) sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE) createmode := uint32(syscall.OPEN_EXISTING) fl := uint32(syscall.FILE_FLAG_BACKUP_SEMANTICS) return syscall.CreateFile(pathp, access, sharemode, nil, createmode, fl, 0) } // DirectoryLockGuard holds a lock on the directory. type directoryLockGuard struct { h syscall.Handle path string } // AcquireDirectoryLock acquires exclusive access to a directory. func acquireDirectoryLock(dirPath string, pidFileName string, readOnly bool) (*directoryLockGuard, error) { if readOnly { return nil, ErrWindowsNotSupported } // Convert to absolute path so that Release still works even if we do an unbalanced // chdir in the meantime. absLockFilePath, err := filepath.Abs(filepath.Join(dirPath, pidFileName)) if err != nil { return nil, errors.Wrap(err, "Cannot get absolute path for pid lock file") } // This call creates a file handler in memory that only one process can use at a time. When // that process ends, the file is deleted by the system. // FILE_ATTRIBUTE_TEMPORARY is used to tell Windows to try to create the handle in memory. // FILE_FLAG_DELETE_ON_CLOSE is not specified in syscall_windows.go but tells Windows to delete // the file when all processes holding the handler are closed. // XXX: this works but it's a bit klunky. i'd prefer to use LockFileEx but it needs unsafe pkg. h, err := syscall.CreateFile( syscall.StringToUTF16Ptr(absLockFilePath), 0, 0, nil, syscall.OPEN_ALWAYS, uint32(FILE_ATTRIBUTE_TEMPORARY|FILE_FLAG_DELETE_ON_CLOSE), 0) if err != nil { return nil, errors.Wrapf(err, "Cannot create lock file %q. Another process is using this Badger database", absLockFilePath) } return &directoryLockGuard{h: h, path: absLockFilePath}, nil } // Release removes the directory lock. func (g *directoryLockGuard) release() error { g.path = "" return syscall.CloseHandle(g.h) } // Windows doesn't support syncing directories to the file system. See // https://github.com/dgraph-io/badger/issues/699#issuecomment-504133587 for more details. func syncDir(dir string) error { return nil } badger-2.2007.2/doc.go000066400000000000000000000023021372173116500142670ustar00rootroot00000000000000/* Package badger implements an embeddable, simple and fast key-value database, written in pure Go. It is designed to be highly performant for both reads and writes simultaneously. Badger uses Multi-Version Concurrency Control (MVCC), and supports transactions. It runs transactions concurrently, with serializable snapshot isolation guarantees. Badger uses an LSM tree along with a value log to separate keys from values, hence reducing both write amplification and the size of the LSM tree. This allows LSM tree to be served entirely from RAM, while the values are served from SSD. Usage Badger has the following main types: DB, Txn, Item and Iterator. 
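For example, a minimal round trip through the API looks like the sketch below (error handling is elided, the path is a placeholder, and badger refers to an assumed import of github.com/dgraph-io/badger/v2):

	db, err := badger.Open(badger.DefaultOptions("/tmp/badger"))
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Update runs the closure in a read-write Txn and commits it if no
	// error is returned.
	err = db.Update(func(txn *badger.Txn) error {
		return txn.Set([]byte("answer"), []byte("42"))
	})

	// View runs the closure in a read-only Txn.
	err = db.View(func(txn *badger.Txn) error {
		item, err := txn.Get([]byte("answer"))
		if err != nil {
			return err
		}
		val, err := item.ValueCopy(nil)
		fmt.Printf("%s\n", val)
		return err
	})
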
DB contains keys that are associated with values. It must be opened with the appropriate options before it can be accessed. All operations happen inside a Txn. Txn represents a transaction, which can be read-only or read-write. Read-only transactions can read values for a given key (which are returned inside an Item), or iterate over a set of key-value pairs using an Iterator (which are returned as Item type values as well). Read-write transactions can also update and delete keys from the DB. See the examples for more usage details. */ package badger badger-2.2007.2/errors.go000066400000000000000000000130671372173116500150500ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "math" "github.com/pkg/errors" ) const ( // ValueThresholdLimit is the maximum permissible value of opt.ValueThreshold. ValueThresholdLimit = math.MaxUint16 - 16 + 1 ) var ( // ErrValueLogSize is returned when opt.ValueLogFileSize option is not within the valid // range. ErrValueLogSize = errors.New("Invalid ValueLogFileSize, must be between 1MB and 2GB") // ErrKeyNotFound is returned when key isn't found on a txn.Get. ErrKeyNotFound = errors.New("Key not found") // ErrTxnTooBig is returned if too many writes are fit into a single transaction. ErrTxnTooBig = errors.New("Txn is too big to fit into one request") // ErrConflict is returned when a transaction conflicts with another transaction. This can // happen if the read rows had been updated concurrently by another transaction. ErrConflict = errors.New("Transaction Conflict. Please retry") // ErrReadOnlyTxn is returned if an update function is called on a read-only transaction. ErrReadOnlyTxn = errors.New("No sets or deletes are allowed in a read-only transaction") // ErrDiscardedTxn is returned if a previously discarded transaction is re-used. ErrDiscardedTxn = errors.New("This transaction has been discarded. Create a new one") // ErrEmptyKey is returned if an empty key is passed on an update function. ErrEmptyKey = errors.New("Key cannot be empty") // ErrInvalidKey is returned if the key has a special !badger! prefix, // reserved for internal usage. ErrInvalidKey = errors.New("Key is using a reserved !badger! prefix") // ErrRetry is returned when a log file containing the value is not found. // This usually indicates that it may have been garbage collected, and the // operation needs to be retried. ErrRetry = errors.New("Unable to find log file. Please retry") // ErrThresholdZero is returned if threshold is set to zero, and value log GC is called. // In such a case, GC can't be run. ErrThresholdZero = errors.New( "Value log GC can't run because threshold is set to zero") // ErrNoRewrite is returned if a call for value log GC doesn't result in a log file rewrite. ErrNoRewrite = errors.New( "Value log GC attempt didn't result in any cleanup") // ErrRejected is returned if a value log GC is called either while another GC is running, or // after DB::Close has been called. 
ErrRejected = errors.New("Value log GC request rejected") // ErrInvalidRequest is returned if the user request is invalid. ErrInvalidRequest = errors.New("Invalid request") // ErrManagedTxn is returned if the user tries to use an API which isn't // allowed due to external management of transactions, when using ManagedDB. ErrManagedTxn = errors.New( "Invalid API request. Not allowed to perform this action using ManagedDB") // ErrInvalidDump is returned if a data dump made previously cannot be loaded into the database. ErrInvalidDump = errors.New("Data dump cannot be read") // ErrZeroBandwidth is returned if the user passes in zero bandwidth for sequence. ErrZeroBandwidth = errors.New("Bandwidth must be greater than zero") // ErrInvalidLoadingMode is returned when the opt.ValueLogLoadingMode option is not // within the valid range. ErrInvalidLoadingMode = errors.New("Invalid ValueLogLoadingMode, must be FileIO or MemoryMap") // ErrReplayNeeded is returned when opt.ReadOnly is set but the // database requires a value log replay. ErrReplayNeeded = errors.New("Database was not properly closed, cannot open read-only") // ErrWindowsNotSupported is returned when opt.ReadOnly is used on Windows. ErrWindowsNotSupported = errors.New("Read-only mode is not supported on Windows") // ErrTruncateNeeded is returned when the value log gets corrupt, and requires truncation of // corrupt data to allow Badger to run properly. ErrTruncateNeeded = errors.New( "Value log truncate required to run DB. This might result in data loss") // ErrBlockedWrites is returned if the user called DropAll. During the process of dropping all // data from Badger, we stop accepting new writes, by returning this error. ErrBlockedWrites = errors.New("Writes are blocked, possibly due to DropAll or Close") // ErrNilCallback is returned when the subscriber's callback is nil. ErrNilCallback = errors.New("Callback cannot be nil") // ErrEncryptionKeyMismatch is returned when the storage key does not // match the key previously given. ErrEncryptionKeyMismatch = errors.New("Encryption key mismatch") // ErrInvalidDataKeyID is returned if the datakey id is invalid. ErrInvalidDataKeyID = errors.New("Invalid datakey id") // ErrInvalidEncryptionKey is returned if the length of the encryption key is invalid. ErrInvalidEncryptionKey = errors.New("Encryption key's length should be " + "either 16, 24, or 32 bytes") // ErrGCInMemoryMode is returned when db.RunValueLogGC is called in in-memory mode. ErrGCInMemoryMode = errors.New("Cannot run value log GC when DB is opened in InMemory mode") // ErrDBClosed is returned when a get operation is performed after closing the DB.
ErrDBClosed = errors.New("DB Closed") ) badger-2.2007.2/go.mod000066400000000000000000000013231372173116500143030ustar00rootroot00000000000000module github.com/dgraph-io/badger/v2 go 1.12 require ( github.com/DataDog/zstd v1.4.1 github.com/cespare/xxhash v1.1.0 github.com/dgraph-io/ristretto v0.0.3-0.20200630154024-f66de99634de github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 github.com/dustin/go-humanize v1.0.0 github.com/golang/protobuf v1.3.1 github.com/golang/snappy v0.0.1 github.com/kr/pretty v0.1.0 // indirect github.com/pkg/errors v0.8.1 github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/spf13/cobra v0.0.5 github.com/stretchr/testify v1.4.0 golang.org/x/net v0.0.0-20190620200207-3b0461eec859 golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect ) badger-2.2007.2/go.sum000066400000000000000000000153221372173116500143340ustar00rootroot00000000000000github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/DataDog/zstd v1.4.1 h1:3oxKN3wbHibqx897utPC2LTQU4J+IHWWJO+glkAkpFM= github.com/DataDog/zstd v1.4.1/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo= github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgraph-io/ristretto v0.0.3-0.20200630154024-f66de99634de h1:t0UHb5vdojIDUqktM6+xJAfScFBsVpXZmqC9dsgJmeA= github.com/dgraph-io/ristretto v0.0.3-0.20200630154024-f66de99634de/go.mod h1:KPxhHT9ZxKefz+PCeOGsrHpl1qZ7i70dGTu2u+Ahh6E= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/inconshreveable/mousetrap v1.0.0 
h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cobra v0.0.5 h1:f0B+LkLX6DtmRH1isoNA9VTtNUK9K8xYd28JNNfOv/s= github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb h1:fgwFCsaw9buMuxNd6+DQfAuSFqbNiQZpcgJQAgJsK6k= golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= badger-2.2007.2/histogram.go000066400000000000000000000116311372173116500155240ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "fmt" "math" ) // PrintHistogram builds and displays the key-value size histogram. // When keyPrefix is set, only the keys that have the prefix "keyPrefix" are // considered for creating the histogram. func (db *DB) PrintHistogram(keyPrefix []byte) { if db == nil { fmt.Println("\nCannot build histogram: DB is nil.") return } histogram := db.buildHistogram(keyPrefix) fmt.Printf("Histogram of key sizes (in bytes)\n") histogram.keySizeHistogram.printHistogram() fmt.Printf("Histogram of value sizes (in bytes)\n") histogram.valueSizeHistogram.printHistogram() } // histogramData stores information about a histogram. type histogramData struct { bins []int64 countPerBin []int64 totalCount int64 min int64 max int64 sum int64 } // sizeHistogram contains the key size histogram and the value size histogram. type sizeHistogram struct { keySizeHistogram, valueSizeHistogram histogramData } // newSizeHistogram returns a new instance of sizeHistogram with // properly initialized fields. func newSizeHistogram() *sizeHistogram { // TODO(ibrahim): find appropriate bin size. keyBins := createHistogramBins(1, 16) valueBins := createHistogramBins(1, 30) return &sizeHistogram{ keySizeHistogram: histogramData{ bins: keyBins, countPerBin: make([]int64, len(keyBins)+1), max: math.MinInt64, min: math.MaxInt64, sum: 0, }, valueSizeHistogram: histogramData{ bins: valueBins, countPerBin: make([]int64, len(valueBins)+1), max: math.MinInt64, min: math.MaxInt64, sum: 0, }, } } // createHistogramBins creates bins for a histogram. The bin sizes are powers // of two of the form [2^min_exponent, ..., 2^max_exponent]. func createHistogramBins(minExponent, maxExponent uint32) []int64 { var bins []int64 for i := minExponent; i <= maxExponent; i++ { bins = append(bins, int64(1)<<i) } return bins } // Update the min and max fields if value is less than or greater than the // current min/max value, and allocate the value to its bin. func (histogram *histogramData) Update(value int64) { if value > histogram.max { histogram.max = value } if value < histogram.min { histogram.min = value } histogram.sum += value histogram.totalCount++ for index := 0; index <= len(histogram.bins); index++ { // Allocate value in the last buckets if we reached the end of the Bounds array.
if index == len(histogram.bins) { histogram.countPerBin[index]++ break } // Check if the value should be added to the "index" bin if value < int64(histogram.bins[index]) { histogram.countPerBin[index]++ break } } } // buildHistogram builds the key-value size histogram. // When keyPrefix is set, only the keys that have prefix "keyPrefix" are // considered for creating the histogram func (db *DB) buildHistogram(keyPrefix []byte) *sizeHistogram { txn := db.NewTransaction(false) defer txn.Discard() itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() badgerHistogram := newSizeHistogram() // Collect key and value sizes. for itr.Seek(keyPrefix); itr.ValidForPrefix(keyPrefix); itr.Next() { item := itr.Item() badgerHistogram.keySizeHistogram.Update(item.KeySize()) badgerHistogram.valueSizeHistogram.Update(item.ValueSize()) } return badgerHistogram } // printHistogram prints the histogram data in a human-readable format. func (histogram histogramData) printHistogram() { fmt.Printf("Total count: %d\n", histogram.totalCount) fmt.Printf("Min value: %d\n", histogram.min) fmt.Printf("Max value: %d\n", histogram.max) fmt.Printf("Mean: %.2f\n", float64(histogram.sum)/float64(histogram.totalCount)) fmt.Printf("%24s %9s\n", "Range", "Count") numBins := len(histogram.bins) for index, count := range histogram.countPerBin { if count == 0 { continue } // The last bin represents the bin that contains the range from // the last bin up to infinity so it's processed differently than the // other bins. if index == len(histogram.countPerBin)-1 { lowerBound := int(histogram.bins[numBins-1]) fmt.Printf("[%10d, %10s) %9d\n", lowerBound, "infinity", count) continue } upperBound := int(histogram.bins[index]) lowerBound := 0 if index > 0 { lowerBound = int(histogram.bins[index-1]) } fmt.Printf("[%10d, %10d) %9d\n", lowerBound, upperBound, count) } fmt.Println() } badger-2.2007.2/histogram_test.go000066400000000000000000000065541372173116500165730ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "testing" "github.com/stretchr/testify/require" ) func TestBuildKeyValueSizeHistogram(t *testing.T) { t.Run("All same size key-values", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { entries := int64(40) err := db.Update(func(txn *Txn) error { for i := rune(0); i < rune(entries); i++ { err := txn.SetEntry(NewEntry([]byte(string(i)), []byte("B"))) if err != nil { return err } } return nil }) require.NoError(t, err) histogram := db.buildHistogram(nil) keyHistogram := histogram.keySizeHistogram valueHistogram := histogram.valueSizeHistogram require.Equal(t, entries, keyHistogram.totalCount) require.Equal(t, entries, valueHistogram.totalCount) // Each entry is of size one. So the sum of sizes should be the same // as number of entries require.Equal(t, entries, valueHistogram.sum) require.Equal(t, entries, keyHistogram.sum) // All value sizes are same. The first bin should have all the values. 
require.Equal(t, entries, valueHistogram.countPerBin[0]) require.Equal(t, entries, keyHistogram.countPerBin[0]) require.Equal(t, int64(1), keyHistogram.max) require.Equal(t, int64(1), keyHistogram.min) require.Equal(t, int64(1), valueHistogram.max) require.Equal(t, int64(1), valueHistogram.min) }) }) t.Run("different size key-values", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { entries := int64(3) err := db.Update(func(txn *Txn) error { if err := txn.SetEntry(NewEntry([]byte("A"), []byte("B"))); err != nil { return err } if err := txn.SetEntry(NewEntry([]byte("AA"), []byte("BB"))); err != nil { return err } return txn.SetEntry(NewEntry([]byte("AAA"), []byte("BBB"))) }) require.NoError(t, err) histogram := db.buildHistogram(nil) keyHistogram := histogram.keySizeHistogram valueHistogram := histogram.valueSizeHistogram require.Equal(t, entries, keyHistogram.totalCount) require.Equal(t, entries, valueHistogram.totalCount) // Keys and values are 1, 2, and 3 bytes long, so each sum of sizes should be 6. require.Equal(t, int64(6), valueHistogram.sum) require.Equal(t, int64(6), keyHistogram.sum) // The length-1 key is in the first bucket; lengths 2 and 3 are in the second // bucket. require.Equal(t, int64(1), valueHistogram.countPerBin[0]) require.Equal(t, int64(2), valueHistogram.countPerBin[1]) require.Equal(t, int64(1), keyHistogram.countPerBin[0]) require.Equal(t, int64(2), keyHistogram.countPerBin[1]) require.Equal(t, int64(3), keyHistogram.max) require.Equal(t, int64(1), keyHistogram.min) require.Equal(t, int64(3), valueHistogram.max) require.Equal(t, int64(1), valueHistogram.min) }) }) } badger-2.2007.2/images/000077500000000000000000000000001372173116500144435ustar00rootroot00000000000000badger-2.2007.2/images/benchmarks-rocksdb.png000066400000000000000000002025721372173116500207230ustar00rootroot00000000000000[binary PNG image data omitted]
badger-2.2007.2/images/diggy-shadow.png000066400000000000000000001005151372173116500175410ustar00rootroot00000000000000[binary PNG image data omitted]
badger-2.2007.2/integration/000077500000000000000000000000001372173116500155215ustar00rootroot00000000000000badger-2.2007.2/integration/testgc/000077500000000000000000000000001372173116500170125ustar00rootroot00000000000000badger-2.2007.2/integration/testgc/.gitignore000066400000000000000000000000101372173116500207710ustar00rootroot00000000000000/testgc badger-2.2007.2/integration/testgc/main.go000066400000000000000000000112371372173116500202710ustar00rootroot00000000000000package main import ( "encoding/binary" "fmt" "log" "math/rand" "net/http" _ "net/http/pprof" "os" "sync" "sync/atomic" "time" "github.com/dgraph-io/badger/v2" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/y" ) var maxValue int64 = 10000000 var suffix = make([]byte, 128) type testSuite struct { sync.Mutex vals map[uint64]uint64 count uint64 // Not under mutex lock. } func encoded(i uint64) []byte { out := make([]byte, 8) binary.BigEndian.PutUint64(out, i) return out } func (s *testSuite) write(db *badger.DB) error { return db.Update(func(txn *badger.Txn) error { for i := 0; i < 10; i++ { // These keys would be overwritten. keyi := uint64(rand.Int63n(maxValue)) key := encoded(keyi) vali := atomic.AddUint64(&s.count, 1) val := encoded(vali) val = append(val, suffix...) if err := txn.SetEntry(badger.NewEntry(key, val)); err != nil { return err } } for i := 0; i < 20; i++ { // These keys would be new and never overwritten. keyi := atomic.AddUint64(&s.count, 1) if keyi%1000000 == 0 { log.Printf("Count: %d\n", keyi) } key := encoded(keyi) val := append(key, suffix...) if err := txn.SetEntry(badger.NewEntry(key, val)); err != nil { return err } } return nil }) } func (s *testSuite) read(db *badger.DB) error { max := int64(atomic.LoadUint64(&s.count)) keyi := uint64(rand.Int63n(max)) key := encoded(keyi) err := db.View(func(txn *badger.Txn) error { item, err := txn.Get(key) if err != nil { return err } val, err := item.ValueCopy(nil) if err != nil { return err } y.AssertTruef(len(val) == len(suffix)+8, "Found val of len: %d\n", len(val)) vali := binary.BigEndian.Uint64(val[0:8]) s.Lock() expected := s.vals[keyi] if vali < expected { log.Fatalf("Expected: %d. Found: %d. Key: %d\n", expected, vali, keyi) } else if vali == expected { // pass } else { s.vals[keyi] = vali } s.Unlock() return nil }) if err == badger.ErrKeyNotFound { return nil } return err } func main() { fmt.Println("Badger Integration test for value log GC.") dir := "/mnt/drive/badgertest" os.RemoveAll(dir) db, err := badger.Open(badger.DefaultOptions(dir). WithTableLoadingMode(options.MemoryMap). WithValueLogLoadingMode(options.FileIO). WithSyncWrites(false)) if err != nil { log.Fatal(err) } defer db.Close() go func() { _ = http.ListenAndServe("localhost:8080", nil) }() closer := y.NewCloser(11) go func() { // Run value log GC.
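// RunValueLogGC(0.1) rewrites at most one value log file per call, and only if roughly
// 10% or more of that file's space can be discarded. It returns nil when a file was
// actually rewritten, and ErrNoRewrite when no file qualified (see errors.go above),
// which is why this loop retries immediately, via "goto again", after every
// successful pass: there may be more files worth collecting.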
defer closer.Done() var count int ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() for range ticker.C { again: select { case <-closer.HasBeenClosed(): log.Printf("Num times value log GC was successful: %d\n", count) return default: } log.Printf("Starting a value log GC") err := db.RunValueLogGC(0.1) log.Printf("Result of value log GC: %v\n", err) if err == nil { count++ goto again } } }() s := testSuite{ count: uint64(maxValue), vals: make(map[uint64]uint64), } var numLoops uint64 ticker := time.NewTicker(5 * time.Second) for i := 0; i < 10; i++ { go func() { defer closer.Done() for { if err := s.write(db); err != nil { log.Fatal(err) } for j := 0; j < 10; j++ { if err := s.read(db); err != nil { log.Fatal(err) } } nl := atomic.AddUint64(&numLoops, 1) select { case <-closer.HasBeenClosed(): return case <-ticker.C: log.Printf("Num loops: %d\n", nl) default: } } }() } time.Sleep(5 * time.Minute) log.Println("Signaling...") closer.SignalAndWait() log.Println("Wait done. Now iterating over everything.") err = db.View(func(txn *badger.Txn) error { iopts := badger.DefaultIteratorOptions itr := txn.NewIterator(iopts) defer itr.Close() var total, tested int for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() key := item.Key() keyi := binary.BigEndian.Uint64(key) total++ val, err := item.ValueCopy(nil) if err != nil { return err } if len(val) < 8 { log.Printf("Unexpected value: %x\n", val) continue } vali := binary.BigEndian.Uint64(val[0:8]) expected, ok := s.vals[keyi] // Not all keys must be in vals map. if ok { tested++ if vali < expected { // vali must be equal or greater than what's in the map. log.Fatalf("Expected: %d. Got: %d. Key: %d\n", expected, vali, keyi) } } } log.Printf("Total iterated: %d. Tested values: %d\n", total, tested) return nil }) if err != nil { log.Fatalf("Error while iterating: %v", err) } log.Println("Iteration done. Test successful.") time.Sleep(time.Minute) // Time to do some poking around. } badger-2.2007.2/iterator.go000066400000000000000000000513401372173116500153610ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "fmt" "hash/crc32" "sort" "sync" "sync/atomic" "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/table" "github.com/dgryski/go-farm" "github.com/dgraph-io/badger/v2/y" ) type prefetchStatus uint8 const ( prefetched prefetchStatus = iota + 1 ) // Item is returned during iteration. Both the Key() and Value() output is only valid until // iterator.Next() is called. type Item struct { status prefetchStatus err error wg sync.WaitGroup db *DB key []byte vptr []byte meta byte // We need to store meta to know about bitValuePointer. userMeta byte expiresAt uint64 val []byte slice *y.Slice // Used only during prefetching. 
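// next is used by the iterator to chain prefetched Items into its internal
// fetched/reused lists (see the list type further below).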
next *Item version uint64 txn *Txn } // String returns a string representation of Item func (item *Item) String() string { return fmt.Sprintf("key=%q, version=%d, meta=%x", item.Key(), item.Version(), item.meta) } // Key returns the key. // // Key is only valid as long as item is valid, or transaction is valid. If you need to use it // outside its validity, please use KeyCopy. func (item *Item) Key() []byte { return item.key } // KeyCopy returns a copy of the key of the item, writing it to dst slice. // If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and // returned. func (item *Item) KeyCopy(dst []byte) []byte { return y.SafeCopy(dst, item.key) } // Version returns the commit timestamp of the item. func (item *Item) Version() uint64 { return item.version } // Value retrieves the value of the item from the value log. // // This method must be called within a transaction. Calling it outside a // transaction is considered undefined behavior. If an iterator is being used, // then Item.Value() is defined in the current iteration only, because items are // reused. // // If you need to use a value outside a transaction, please use Item.ValueCopy // instead, or copy it yourself. Value might change once discard or commit is called. // Use ValueCopy if you want to do a Set after Get. func (item *Item) Value(fn func(val []byte) error) error { item.wg.Wait() if item.status == prefetched { if item.err == nil && fn != nil { if err := fn(item.val); err != nil { return err } } return item.err } buf, cb, err := item.yieldItemValue() defer runCallback(cb) if err != nil { return err } if fn != nil { return fn(buf) } return nil } // ValueCopy returns a copy of the value of the item from the value log, writing it to dst slice. // If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and // returned. Tip: It might make sense to reuse the returned slice as dst argument for the next call. // // This function is useful in long running iterate/update transactions to avoid a write deadlock. // See Github issue: https://github.com/dgraph-io/badger/issues/315 func (item *Item) ValueCopy(dst []byte) ([]byte, error) { item.wg.Wait() if item.status == prefetched { return y.SafeCopy(dst, item.val), item.err } buf, cb, err := item.yieldItemValue() defer runCallback(cb) return y.SafeCopy(dst, buf), err } func (item *Item) hasValue() bool { if item.meta == 0 && item.vptr == nil { // key not found return false } return true } // IsDeletedOrExpired returns true if item contains deleted or expired value. func (item *Item) IsDeletedOrExpired() bool { return isDeletedOrExpired(item.meta, item.expiresAt) } // DiscardEarlierVersions returns whether the item was created with the // option to discard earlier versions of a key when multiple are available. func (item *Item) DiscardEarlierVersions() bool { return item.meta&bitDiscardEarlierVersions > 0 } func (item *Item) yieldItemValue() ([]byte, func(), error) { key := item.Key() // No need to copy. 
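// This loop normally runs once. It repeats only when the value log read comes back
// with ErrRetry, i.e. the log file holding this value has since been garbage
// collected; in that case the lookup falls through to the badgerMove keyspace,
// where value log GC rewrites live entries, and is tried again.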
for { if !item.hasValue() { return nil, nil, nil } if item.slice == nil { item.slice = new(y.Slice) } if (item.meta & bitValuePointer) == 0 { val := item.slice.Resize(len(item.vptr)) copy(val, item.vptr) return val, nil, nil } var vp valuePointer vp.Decode(item.vptr) result, cb, err := item.db.vlog.Read(vp, item.slice) if err != ErrRetry { if err != nil { item.db.opt.Logger.Errorf(`Unable to read: Key: %v, Version : %v, meta: %v, userMeta: %v`, key, item.version, item.meta, item.userMeta) } return result, cb, err } if bytes.HasPrefix(key, badgerMove) { // err == ErrRetry // Error is retry even after checking the move keyspace. So, let's // just assume that value is not present. return nil, cb, nil } // The value pointer is pointing to a deleted value log. Look for the // move key and read that instead. runCallback(cb) // Do not put badgerMove on the left in append. It seems to cause some sort of manipulation. keyTs := y.KeyWithTs(item.Key(), item.Version()) key = make([]byte, len(badgerMove)+len(keyTs)) n := copy(key, badgerMove) copy(key[n:], keyTs) // Note that we can't set item.key to move key, because that would // change the key user sees before and after this call. Also, this move // logic is internal logic and should not impact the external behavior // of the retrieval. vs, err := item.db.get(key) if err != nil { return nil, nil, err } if vs.Version != item.Version() { return nil, nil, nil } // Bug fix: Always copy the vs.Value into vptr here. Otherwise, when item is reused this // slice gets overwritten. item.vptr = y.SafeCopy(item.vptr, vs.Value) item.meta &^= bitValuePointer // Clear the value pointer bit. if vs.Meta&bitValuePointer > 0 { item.meta |= bitValuePointer // This meta would only be about value pointer. } } } func runCallback(cb func()) { if cb != nil { cb() } } func (item *Item) prefetchValue() { val, cb, err := item.yieldItemValue() defer runCallback(cb) item.err = err item.status = prefetched if val == nil { return } if item.db.opt.ValueLogLoadingMode == options.MemoryMap { buf := item.slice.Resize(len(val)) copy(buf, val) item.val = buf } else { item.val = val } } // EstimatedSize returns the approximate size of the key-value pair. // // This can be called while iterating through a store to quickly estimate the // size of a range of key-value pairs (without fetching the corresponding // values). func (item *Item) EstimatedSize() int64 { if !item.hasValue() { return 0 } if (item.meta & bitValuePointer) == 0 { return int64(len(item.key) + len(item.vptr)) } var vp valuePointer vp.Decode(item.vptr) return int64(vp.Len) // includes key length. } // KeySize returns the size of the key. // Exact size of the key is key + 8 bytes of timestamp func (item *Item) KeySize() int64 { return int64(len(item.key)) } // ValueSize returns the approximate size of the value. // // This can be called to quickly estimate the size of a value without fetching // it. func (item *Item) ValueSize() int64 { if !item.hasValue() { return 0 } if (item.meta & bitValuePointer) == 0 { return int64(len(item.vptr)) } var vp valuePointer vp.Decode(item.vptr) klen := int64(len(item.key) + 8) // 8 bytes for timestamp. // 6 bytes are for the approximate length of the header. Since header is encoded in varint, we // cannot find the exact length of header without fetching it. return int64(vp.Len) - klen - 6 - crc32.Size } // UserMeta returns the userMeta set by the user. Typically, this byte, optionally set by the user // is used to interpret the value. 
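// For example (a hypothetical sketch; the 0x01 tag, key, and value names are
// illustrative only), a client package could record how each value is encoded
// at write time:
//
//	e := badger.NewEntry([]byte("user/42"), jsonBytes).WithMeta(0x01)
//	err := txn.SetEntry(e)
//
// and then branch on item.UserMeta() == 0x01 when decoding reads.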
func (item *Item) UserMeta() byte { return item.userMeta } // ExpiresAt returns a Unix time value indicating when the item will be // considered expired. 0 indicates that the item will never expire. func (item *Item) ExpiresAt() uint64 { return item.expiresAt } // TODO: Switch this to use linked list container in Go. type list struct { head *Item tail *Item } func (l *list) push(i *Item) { i.next = nil if l.tail == nil { l.head = i l.tail = i return } l.tail.next = i l.tail = i } func (l *list) pop() *Item { if l.head == nil { return nil } i := l.head if l.head == l.tail { l.tail = nil l.head = nil } else { l.head = i.next } i.next = nil return i } // IteratorOptions is used to set options when iterating over Badger key-value // stores. // // This package provides DefaultIteratorOptions which contains options that // should work for most applications. Consider using that as a starting point // before customizing it for your own needs. type IteratorOptions struct { // Indicates whether we should prefetch values during iteration and store them. PrefetchValues bool // How many KV pairs to prefetch while iterating. Valid only if PrefetchValues is true. PrefetchSize int Reverse bool // Direction of iteration. False is forward, true is backward. AllVersions bool // Fetch all valid versions of the same key. // The following option is used to narrow down the SSTables that iterator picks up. If // Prefix is specified, only tables which could have this prefix are picked based on their range // of keys. Prefix []byte // Only iterate over this given prefix. prefixIsKey bool // If set, use the prefix for bloom filter lookup. InternalAccess bool // Used to allow internal access to badger keys. } func (opt *IteratorOptions) compareToPrefix(key []byte) int { // We should compare key without timestamp. For example key - a[TS] might be > "aa" prefix. key = y.ParseKey(key) if len(key) > len(opt.Prefix) { key = key[:len(opt.Prefix)] } return bytes.Compare(key, opt.Prefix) } func (opt *IteratorOptions) pickTable(t table.TableInterface) bool { if len(opt.Prefix) == 0 { return true } if opt.compareToPrefix(t.Smallest()) > 0 { return false } if opt.compareToPrefix(t.Biggest()) < 0 { return false } // Bloom filter lookup would only work if opt.Prefix does NOT have the read // timestamp as part of the key. if opt.prefixIsKey && t.DoesNotHave(farm.Fingerprint64(opt.Prefix)) { return false } return true } // pickTables picks the necessary table for the iterator. This function also assumes // that the tables are sorted in the right order. func (opt *IteratorOptions) pickTables(all []*table.Table) []*table.Table { if len(opt.Prefix) == 0 { out := make([]*table.Table, len(all)) copy(out, all) return out } sIdx := sort.Search(len(all), func(i int) bool { return opt.compareToPrefix(all[i].Biggest()) >= 0 }) if sIdx == len(all) { // Not found. return []*table.Table{} } filtered := all[sIdx:] if !opt.prefixIsKey { eIdx := sort.Search(len(filtered), func(i int) bool { return opt.compareToPrefix(filtered[i].Smallest()) > 0 }) out := make([]*table.Table, len(filtered[:eIdx])) copy(out, filtered[:eIdx]) return out } var out []*table.Table hash := farm.Fingerprint64(opt.Prefix) for _, t := range filtered { // When we encounter the first table whose smallest key is higher than // opt.Prefix, we can stop. if opt.compareToPrefix(t.Smallest()) > 0 { return out } // opt.Prefix is actually the key. So, we can run bloom filter checks // as well. 
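		// DoesNotHave consults the table's bloom filter with the farm
		// fingerprint computed above. Bloom filters have no false negatives,
		// so a true answer is definitive and the table can be skipped unread.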
if t.DoesNotHave(hash) { continue } out = append(out, t) } return out } // DefaultIteratorOptions contains default options when iterating over Badger key-value stores. var DefaultIteratorOptions = IteratorOptions{ PrefetchValues: true, PrefetchSize: 100, Reverse: false, AllVersions: false, } // Iterator helps iterating over the KV pairs in a lexicographically sorted order. type Iterator struct { iitr y.Iterator txn *Txn readTs uint64 opt IteratorOptions item *Item data list waste list lastKey []byte // Used to skip over multiple versions of the same key. closed bool // ThreadId is an optional value that can be set to identify which goroutine created // the iterator. It can be used, for example, to uniquely identify each of the // iterators created by the stream interface ThreadId int } // NewIterator returns a new iterator. Depending upon the options, either only keys, or both // key-value pairs would be fetched. The keys are returned in lexicographically sorted order. // Using prefetch is recommended if you're doing a long running iteration, for performance. // // Multiple Iterators: // For a read-only txn, multiple iterators can be running simultaneously. However, for a read-write // txn, iterators have the nuance of being a snapshot of the writes for the transaction at the time // iterator was created. If writes are performed after an iterator is created, then that iterator // will not be able to see those writes. Only writes performed before an iterator was created can be // viewed. func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator { if txn.discarded { panic("Transaction has already been discarded") } if txn.db.IsClosed() { panic(ErrDBClosed.Error()) } // Keep track of the number of active iterators. atomic.AddInt32(&txn.numIterators, 1) // TODO: If Prefix is set, only pick those memtables which have keys with // the prefix. tables, decr := txn.db.getMemTables() defer decr() txn.db.vlog.incrIteratorCount() var iters []y.Iterator if itr := txn.newPendingWritesIterator(opt.Reverse); itr != nil { iters = append(iters, itr) } for i := 0; i < len(tables); i++ { iters = append(iters, tables[i].NewUniIterator(opt.Reverse)) } iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references. res := &Iterator{ txn: txn, iitr: table.NewMergeIterator(iters, opt.Reverse), opt: opt, readTs: txn.readTs, } return res } // NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a // single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to // additionally run bloom filter lookups before picking tables from the LSM tree. func (txn *Txn) NewKeyIterator(key []byte, opt IteratorOptions) *Iterator { if len(opt.Prefix) > 0 { panic("opt.Prefix should be nil for NewKeyIterator.") } opt.Prefix = key // This key must be without the timestamp. opt.prefixIsKey = true opt.AllVersions = true return txn.NewIterator(opt) } func (it *Iterator) newItem() *Item { item := it.waste.pop() if item == nil { item = &Item{slice: new(y.Slice), db: it.txn.db, txn: it.txn} } return item } // Item returns pointer to the current key-value pair. // This item is only valid until it.Next() gets called. func (it *Iterator) Item() *Item { tx := it.txn tx.addReadKey(it.item.Key()) return it.item } // Valid returns false when iteration is done. 
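// The canonical read loop, from a client package, looks like the sketch below
// (it assumes an open *badger.DB named db; error handling is elided):
//
//	err := db.View(func(txn *badger.Txn) error {
//		it := txn.NewIterator(badger.DefaultIteratorOptions)
//		defer it.Close()
//		for it.Rewind(); it.Valid(); it.Next() {
//			item := it.Item()
//			if err := item.Value(func(v []byte) error {
//				fmt.Printf("key=%s value=%s\n", item.Key(), v)
//				return nil
//			}); err != nil {
//				return err
//			}
//		}
//		return nil
//	})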
func (it *Iterator) Valid() bool { if it.item == nil { return false } if it.opt.prefixIsKey { return bytes.Equal(it.item.key, it.opt.Prefix) } return bytes.HasPrefix(it.item.key, it.opt.Prefix) } // ValidForPrefix returns false when iteration is done // or when the current key is not prefixed by the specified prefix. func (it *Iterator) ValidForPrefix(prefix []byte) bool { return it.Valid() && bytes.HasPrefix(it.item.key, prefix) } // Close would close the iterator. It is important to call this when you're done with iteration. func (it *Iterator) Close() { if it.closed { return } it.closed = true it.iitr.Close() // It is important to wait for the fill goroutines to finish. Otherwise, we might leave zombie // goroutines behind, which are waiting to acquire file read locks after DB has been closed. waitFor := func(l list) { item := l.pop() for item != nil { item.wg.Wait() item = l.pop() } } waitFor(it.waste) waitFor(it.data) // TODO: We could handle this error. _ = it.txn.db.vlog.decrIteratorCount() atomic.AddInt32(&it.txn.numIterators, -1) } // Next would advance the iterator by one. Always check it.Valid() after a Next() // to ensure you have access to a valid it.Item(). func (it *Iterator) Next() { // Reuse current item it.item.wg.Wait() // Just cleaner to wait before pushing to avoid doing ref counting. it.waste.push(it.item) // Set next item to current it.item = it.data.pop() for it.iitr.Valid() { if it.parseItem() { // parseItem calls one extra next. // This is used to deal with the complexity of reverse iteration. break } } } func isDeletedOrExpired(meta byte, expiresAt uint64) bool { if meta&bitDelete > 0 { return true } if expiresAt == 0 { return false } return expiresAt <= uint64(time.Now().Unix()) } // parseItem is a complex function because it needs to handle both the forward and reverse iteration // implementations. We store keys such that their versions are sorted in descending order. This makes // forward iteration efficient, but reverse iteration complicated. This tradeoff is better because // forward iteration is more common than reverse. // // This function advances the iterator. func (it *Iterator) parseItem() bool { mi := it.iitr key := mi.Key() setItem := func(item *Item) { if it.item == nil { it.item = item } else { it.data.push(item) } } // Skip badger keys. if !it.opt.InternalAccess && bytes.HasPrefix(key, badgerPrefix) { mi.Next() return false } // Skip any versions which are beyond the readTs. version := y.ParseTs(key) if version > it.readTs { mi.Next() return false } if it.opt.AllVersions { // Return deleted or expired values also, otherwise the user can't figure out // whether the key was deleted. item := it.newItem() it.fill(item) setItem(item) mi.Next() return true } // If iterating in forward direction, then just checking the last key against current key would // be sufficient. if !it.opt.Reverse { if y.SameKey(it.lastKey, key) { mi.Next() return false } // Only track in forward direction. // We should update lastKey as soon as we find a different key in our snapshot. // Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a. // Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5, // which is wrong. Therefore, update lastKey here. it.lastKey = y.SafeCopy(it.lastKey, mi.Key()) } FILL: // If deleted, advance and return. vs := mi.Value() if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { mi.Next() return false } item := it.newItem() it.fill(item) // fill item based on current cursor position.
All Next calls have returned, so reaching here // means no Next was called. mi.Next() // Advance but no fill item yet. if !it.opt.Reverse || !mi.Valid() { // Forward direction, or invalid. setItem(item) return true } // Reverse direction. nextTs := y.ParseTs(mi.Key()) mik := y.ParseKey(mi.Key()) if nextTs <= it.readTs && bytes.Equal(mik, item.key) { // This is a valid potential candidate. goto FILL } // Ignore the next candidate. Return the current one. setItem(item) return true } func (it *Iterator) fill(item *Item) { vs := it.iitr.Value() item.meta = vs.Meta item.userMeta = vs.UserMeta item.expiresAt = vs.ExpiresAt item.version = y.ParseTs(it.iitr.Key()) item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key())) item.vptr = y.SafeCopy(item.vptr, vs.Value) item.val = nil if it.opt.PrefetchValues { item.wg.Add(1) go func() { // FIXME we are not handling errors here. item.prefetchValue() item.wg.Done() }() } } func (it *Iterator) prefetch() { prefetchSize := 2 if it.opt.PrefetchValues && it.opt.PrefetchSize > 1 { prefetchSize = it.opt.PrefetchSize } i := it.iitr var count int it.item = nil for i.Valid() { if !it.parseItem() { continue } count++ if count == prefetchSize { break } } } // Seek would seek to the provided key if present. If absent, it would seek to the next // smallest key greater than the provided key if iterating in the forward direction. // Behavior would be reversed if iterating backwards. func (it *Iterator) Seek(key []byte) { if len(key) > 0 { it.txn.addReadKey(key) } for i := it.data.pop(); i != nil; i = it.data.pop() { i.wg.Wait() it.waste.push(i) } it.lastKey = it.lastKey[:0] if len(key) == 0 { key = it.opt.Prefix } if len(key) == 0 { it.iitr.Rewind() it.prefetch() return } if !it.opt.Reverse { key = y.KeyWithTs(key, it.txn.readTs) } else { key = y.KeyWithTs(key, 0) } it.iitr.Seek(key) it.prefetch() } // Rewind would rewind the iterator cursor all the way to zero-th position, which would be the // smallest key if iterating forward, and largest if iterating backward. It does not keep track of // whether the cursor started with a Seek(). func (it *Iterator) Rewind() { it.Seek(nil) } badger-2.2007.2/iterator_test.go000066400000000000000000000240551372173116500164230ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "fmt" "io/ioutil" "math/rand" "os" "path/filepath" "strings" "testing" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) type tableMock struct { left, right []byte } func (tm *tableMock) Smallest() []byte { return tm.left } func (tm *tableMock) Biggest() []byte { return tm.right } func (tm *tableMock) DoesNotHave(hash uint64) bool { return false } func TestPickTables(t *testing.T) { opt := DefaultIteratorOptions within := func(prefix, left, right []byte) { opt.Prefix = prefix // PickTable expects smallest and biggest to contain timestamps. 
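		// y.KeyWithTs appends the 8-byte big-endian version timestamp that
		// badger stores after every user key (encoded as math.MaxUint64-ts so
		// newer versions sort first); compareToPrefix strips it again with
		// y.ParseKey before comparing against opt.Prefix.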
tm := &tableMock{left: y.KeyWithTs(left, 1), right: y.KeyWithTs(right, 1)} require.True(t, opt.pickTable(tm), "within failed for %b %b %b\n", prefix, left, right) } outside := func(prefix, left, right string) { opt.Prefix = []byte(prefix) // PickTable expects smallest and biggest to contain timestamps. tm := &tableMock{left: y.KeyWithTs([]byte(left), 1), right: y.KeyWithTs([]byte(right), 1)} require.False(t, opt.pickTable(tm), "outside failed for %b %b %b", prefix, left, right) } within([]byte("abc"), []byte("ab"), []byte("ad")) within([]byte("abc"), []byte("abc"), []byte("ad")) within([]byte("abc"), []byte("abb123"), []byte("ad")) within([]byte("abc"), []byte("abc123"), []byte("abd234")) within([]byte("abc"), []byte("abc123"), []byte("abc456")) // Regression test for https://github.com/dgraph-io/badger/issues/992 within([]byte{0, 0, 1}, []byte{0}, []byte{0, 0, 1}) outside("abd", "abe", "ad") outside("abd", "ac", "ad") outside("abd", "b", "e") outside("abd", "a", "ab") outside("abd", "ab", "abc") outside("abd", "ab", "abc123") } func TestPickSortTables(t *testing.T) { type MockKeys struct { small string large string } genTables := func(mks ...MockKeys) []*table.Table { out := make([]*table.Table, 0) for _, mk := range mks { opts := table.Options{LoadingMode: options.MemoryMap, ChkMode: options.OnTableAndBlockRead} f := buildTable(t, [][]string{{mk.small, "some value"}, {mk.large, "some value"}}, opts) tbl, err := table.OpenTable(f, opts) require.NoError(t, err) out = append(out, tbl) } return out } tables := genTables(MockKeys{small: "a", large: "abc"}, MockKeys{small: "abcd", large: "cde"}, MockKeys{small: "cge", large: "chf"}, MockKeys{small: "glr", large: "gyup"}) opt := DefaultIteratorOptions opt.prefixIsKey = false opt.Prefix = []byte("c") filtered := opt.pickTables(tables) require.Equal(t, 2, len(filtered)) // buildTable adds a timestamp, so the trailing bytes are removed before comparing.
require.Equal(t, filtered[0].Smallest()[:4], []byte("abcd")) require.Equal(t, filtered[1].Smallest()[:3], []byte("cge")) tables = genTables(MockKeys{small: "a", large: "abc"}, MockKeys{small: "abcd", large: "ade"}, MockKeys{small: "cge", large: "chf"}, MockKeys{small: "glr", large: "gyup"}) filtered = opt.pickTables(tables) require.Equal(t, 1, len(filtered)) require.Equal(t, filtered[0].Smallest()[:3], []byte("cge")) tables = genTables(MockKeys{small: "a", large: "abc"}, MockKeys{small: "abcd", large: "ade"}, MockKeys{small: "cge", large: "chf"}, MockKeys{small: "ckr", large: "cyup"}, MockKeys{small: "csfr", large: "gyup"}) filtered = opt.pickTables(tables) require.Equal(t, 3, len(filtered)) require.Equal(t, filtered[0].Smallest()[:3], []byte("cge")) require.Equal(t, filtered[1].Smallest()[:3], []byte("ckr")) require.Equal(t, filtered[2].Smallest()[:4], []byte("csfr")) opt.Prefix = []byte("aa") filtered = opt.pickTables(tables) require.Equal(t, y.ParseKey(filtered[0].Smallest()), []byte("a")) require.Equal(t, y.ParseKey(filtered[0].Biggest()), []byte("abc")) } func TestIteratePrefix(t *testing.T) { testIteratorPrefix := func(t *testing.T, db *DB) { bkey := func(i int) []byte { return []byte(fmt.Sprintf("%04d", i)) } val := []byte("OK") n := 10000 batch := db.NewWriteBatch() for i := 0; i < n; i++ { if (i % 1000) == 0 { t.Logf("Put i=%d\n", i) } require.NoError(t, batch.Set(bkey(i), val)) } require.NoError(t, batch.Flush()) countKeys := func(prefix string) int { t.Logf("Testing with prefix: %s", prefix) var count int opt := DefaultIteratorOptions opt.Prefix = []byte(prefix) err := db.View(func(txn *Txn) error { itr := txn.NewIterator(opt) defer itr.Close() for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() err := item.Value(func(v []byte) error { require.Equal(t, val, v) return nil }) require.NoError(t, err) require.True(t, bytes.HasPrefix(item.Key(), opt.Prefix)) count++ } return nil }) require.NoError(t, err) return count } countOneKey := func(key []byte) int { var count int err := db.View(func(txn *Txn) error { itr := txn.NewKeyIterator(key, DefaultIteratorOptions) defer itr.Close() for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() err := item.Value(func(v []byte) error { require.Equal(t, val, v) return nil }) require.NoError(t, err) require.Equal(t, key, item.Key()) count++ } return nil }) require.NoError(t, err) return count } for i := 0; i <= 9; i++ { require.Equal(t, 1, countKeys(fmt.Sprintf("%d%d%d%d", i, i, i, i))) require.Equal(t, 10, countKeys(fmt.Sprintf("%d%d%d", i, i, i))) require.Equal(t, 100, countKeys(fmt.Sprintf("%d%d", i, i))) require.Equal(t, 1000, countKeys(fmt.Sprintf("%d", i))) } require.Equal(t, 10000, countKeys("")) t.Logf("Testing each key with key iterator") for i := 0; i < n; i++ { require.Equal(t, 1, countOneKey(bkey(i))) } } t.Run("With Default options", func(t *testing.T) { t.Parallel() runBadgerTest(t, nil, func(t *testing.T, db *DB) { testIteratorPrefix(t, db) }) }) t.Run("With Block Offsets in Cache", func(t *testing.T) { t.Parallel() opts := getTestOptions("") opts.IndexCacheSize = 100 << 20 runBadgerTest(t, &opts, func(t *testing.T, db *DB) { testIteratorPrefix(t, db) }) }) t.Run("With Block Offsets and Blocks in Cache", func(t *testing.T) { t.Parallel() opts := getTestOptions("") opts.BlockCacheSize = 100 << 20 opts.IndexCacheSize = 100 << 20 runBadgerTest(t, &opts, func(t *testing.T, db *DB) { testIteratorPrefix(t, db) }) }) t.Run("With Blocks in Cache", func(t *testing.T) { t.Parallel() opts := getTestOptions("") 
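		// 100 << 20 is 100 MiB. BlockCacheSize bounds the cache of SST data
		// blocks, while IndexCacheSize (exercised by the subtests above) bounds
		// the cache of per-table indices such as block offsets; both are
		// specified in bytes.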
opts.BlockCacheSize = 100 << 20 runBadgerTest(t, &opts, func(t *testing.T, db *DB) { testIteratorPrefix(t, db) }) }) } // go test -v -run=XXX -bench=BenchmarkIterate -benchtime=3s // Benchmark with opt.Prefix set === // goos: linux // goarch: amd64 // pkg: github.com/dgraph-io/badger // BenchmarkIteratePrefixSingleKey/Key_lookups-4 10000 365539 ns/op // --- BENCH: BenchmarkIteratePrefixSingleKey/Key_lookups-4 // iterator_test.go:147: Inner b.N: 1 // iterator_test.go:147: Inner b.N: 100 // iterator_test.go:147: Inner b.N: 10000 // --- BENCH: BenchmarkIteratePrefixSingleKey // iterator_test.go:143: LSM files: 79 // iterator_test.go:145: Outer b.N: 1 // PASS // ok github.com/dgraph-io/badger 41.586s // // Benchmark with NO opt.Prefix set === // goos: linux // goarch: amd64 // pkg: github.com/dgraph-io/badger // BenchmarkIteratePrefixSingleKey/Key_lookups-4 10000 460924 ns/op // --- BENCH: BenchmarkIteratePrefixSingleKey/Key_lookups-4 // iterator_test.go:147: Inner b.N: 1 // iterator_test.go:147: Inner b.N: 100 // iterator_test.go:147: Inner b.N: 10000 // --- BENCH: BenchmarkIteratePrefixSingleKey // iterator_test.go:143: LSM files: 83 // iterator_test.go:145: Outer b.N: 1 // PASS // ok github.com/dgraph-io/badger 41.836s // // On my laptop there's a 20% improvement in latency with ~80 files. func BenchmarkIteratePrefixSingleKey(b *testing.B) { dir, err := ioutil.TempDir(".", "badger-test") y.Check(err) defer removeDir(dir) opts := getTestOptions(dir) opts.TableLoadingMode = options.LoadToRAM db, err := Open(opts) y.Check(err) defer db.Close() N := 100000 // Should generate around 80 SSTables. val := []byte("OK") bkey := func(i int) []byte { return []byte(fmt.Sprintf("%06d", i)) } batch := db.NewWriteBatch() for i := 0; i < N; i++ { y.Check(batch.Set(bkey(i), val)) } y.Check(batch.Flush()) var lsmFiles int err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { if strings.HasSuffix(path, ".sst") { lsmFiles++ } if err != nil { return err } return nil }) y.Check(err) b.Logf("LSM files: %d", lsmFiles) b.Logf("Key splits: %v", db.KeySplits(nil)) b.Logf("Key splits with prefix: %v", db.KeySplits([]byte("09"))) b.Logf("Outer b.N: %d", b.N) b.Run("Key lookups", func(b *testing.B) { b.Logf("Inner b.N: %d", b.N) for i := 0; i < b.N; i++ { key := bkey(rand.Intn(N)) err := db.View(func(txn *Txn) error { opt := DefaultIteratorOptions // NOTE: Comment opt.Prefix out here to compare the performance // difference between providing Prefix as an option, v/s not. I // see a 20% improvement when there are ~80 SSTables. opt.Prefix = key opt.AllVersions = true itr := txn.NewIterator(opt) defer itr.Close() var count int for itr.Seek(key); itr.ValidForPrefix(key); itr.Next() { count++ } if count != 1 { b.Fatalf("Count must be one key: %s. Found: %d", key, count) } return nil }) if err != nil { b.Fatalf("Error while View: %v", err) } } }) } badger-2.2007.2/key_registry.go000066400000000000000000000315131372173116500162500ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "crypto/aes" "crypto/rand" "encoding/binary" "hash/crc32" "io" "os" "path/filepath" "sync" "time" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" ) const ( // KeyRegistryFileName is the file name for the key registry file. KeyRegistryFileName = "KEYREGISTRY" // KeyRegistryRewriteFileName is the file name for the rewrite key registry file. KeyRegistryRewriteFileName = "REWRITE-KEYREGISTRY" ) // SanityText is used to check whether the given user provided storage key is valid or not var sanityText = []byte("Hello Badger") // KeyRegistry used to maintain all the data keys. type KeyRegistry struct { sync.RWMutex dataKeys map[uint64]*pb.DataKey lastCreated int64 //lastCreated is the timestamp(seconds) of the last data key generated. nextKeyID uint64 fp *os.File opt KeyRegistryOptions } type KeyRegistryOptions struct { Dir string ReadOnly bool EncryptionKey []byte EncryptionKeyRotationDuration time.Duration InMemory bool } // newKeyRegistry returns KeyRegistry. func newKeyRegistry(opt KeyRegistryOptions) *KeyRegistry { return &KeyRegistry{ dataKeys: make(map[uint64]*pb.DataKey), nextKeyID: 0, opt: opt, } } // OpenKeyRegistry opens key registry if it exists, otherwise it'll create key registry // and returns key registry. func OpenKeyRegistry(opt KeyRegistryOptions) (*KeyRegistry, error) { // sanity check the encryption key length. if len(opt.EncryptionKey) > 0 { switch len(opt.EncryptionKey) { default: return nil, y.Wrapf(ErrInvalidEncryptionKey, "During OpenKeyRegistry") case 16, 24, 32: break } } // If db is opened in InMemory mode, we don't need to write key registry to the disk. if opt.InMemory { return newKeyRegistry(opt), nil } path := filepath.Join(opt.Dir, KeyRegistryFileName) var flags uint32 if opt.ReadOnly { flags |= y.ReadOnly } else { flags |= y.Sync } fp, err := y.OpenExistingFile(path, flags) // OpenExistingFile just open file. // So checking whether the file exist or not. If not // We'll create new keyregistry. if os.IsNotExist(err) { // Creating new registry file if not exist. kr := newKeyRegistry(opt) if opt.ReadOnly { return kr, nil } // Writing the key registry to the file. if err := WriteKeyRegistry(kr, opt); err != nil { return nil, y.Wrapf(err, "Error while writing key registry.") } fp, err = y.OpenExistingFile(path, flags) if err != nil { return nil, y.Wrapf(err, "Error while opening newly created key registry.") } } else if err != nil { return nil, y.Wrapf(err, "Error while opening key registry.") } kr, err := readKeyRegistry(fp, opt) if err != nil { // This case happens only if the file is opened properly and // not able to read. fp.Close() return nil, err } if opt.ReadOnly { // We'll close the file in readonly mode. return kr, fp.Close() } kr.fp = fp return kr, nil } // keyRegistryIterator reads all the datakey from the key registry type keyRegistryIterator struct { encryptionKey []byte fp *os.File // lenCrcBuf contains crc buf and data length to move forward. lenCrcBuf [8]byte } // newKeyRegistryIterator returns iterator which will allow you to iterate // over the data key of the key registry. func newKeyRegistryIterator(fp *os.File, encryptionKey []byte) (*keyRegistryIterator, error) { return &keyRegistryIterator{ encryptionKey: encryptionKey, fp: fp, lenCrcBuf: [8]byte{}, }, validRegistry(fp, encryptionKey) } // validRegistry checks that given encryption key is valid or not. 
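// The registry file begins with a 16-byte IV (aes.BlockSize) followed by the
// sanity text, XOR-encrypted via y.XORBlock (AES-CTR) when a storage key is
// configured. Decrypting those bytes and comparing them against sanityText is
// what turns a wrong encryption key into ErrEncryptionKeyMismatch instead of
// silently garbled datakeys.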
func validRegistry(fp *os.File, encryptionKey []byte) error { iv := make([]byte, aes.BlockSize) var err error if _, err = fp.Read(iv); err != nil { return y.Wrapf(err, "Error while reading IV for key registry.") } eSanityText := make([]byte, len(sanityText)) if _, err = fp.Read(eSanityText); err != nil { return y.Wrapf(err, "Error while reading sanity text.") } if len(encryptionKey) > 0 { // Decrypting sanity text. if eSanityText, err = y.XORBlock(eSanityText, encryptionKey, iv); err != nil { return y.Wrapf(err, "During validRegistry") } } // Check whether the given key is valid. if !bytes.Equal(eSanityText, sanityText) { return ErrEncryptionKeyMismatch } return nil } func (kri *keyRegistryIterator) next() (*pb.DataKey, error) { var err error // Read crc buf and data length. if _, err = kri.fp.Read(kri.lenCrcBuf[:]); err != nil { // EOF means end of the iteration. if err != io.EOF { return nil, y.Wrapf(err, "While reading crc in keyRegistryIterator.next") } return nil, err } l := int64(binary.BigEndian.Uint32(kri.lenCrcBuf[0:4])) // Read protobuf data. data := make([]byte, l) if _, err = kri.fp.Read(data); err != nil { // EOF means end of the iteration. if err != io.EOF { return nil, y.Wrapf(err, "While reading protobuf in keyRegistryIterator.next") } return nil, err } // Check checksum. if crc32.Checksum(data, y.CastagnoliCrcTable) != binary.BigEndian.Uint32(kri.lenCrcBuf[4:]) { return nil, y.Wrapf(y.ErrChecksumMismatch, "Error while checking checksum for data key.") } dataKey := &pb.DataKey{} if err = dataKey.Unmarshal(data); err != nil { return nil, y.Wrapf(err, "While unmarshal of datakey in keyRegistryIterator.next") } if len(kri.encryptionKey) > 0 { // Decrypt the key if the storage key exists. if dataKey.Data, err = y.XORBlock(dataKey.Data, kri.encryptionKey, dataKey.Iv); err != nil { return nil, y.Wrapf(err, "While decrypting datakey in keyRegistryIterator.next") } } return dataKey, nil } // readKeyRegistry will read the key registry file and build the key registry struct. func readKeyRegistry(fp *os.File, opt KeyRegistryOptions) (*KeyRegistry, error) { itr, err := newKeyRegistryIterator(fp, opt.EncryptionKey) if err != nil { return nil, err } kr := newKeyRegistry(opt) var dk *pb.DataKey dk, err = itr.next() for err == nil && dk != nil { if dk.KeyId > kr.nextKeyID { // Set the maximum key ID for next key ID generation. kr.nextKeyID = dk.KeyId } if dk.CreatedAt > kr.lastCreated { // Set the last generated key timestamp. kr.lastCreated = dk.CreatedAt } // No need to lock since we are building the initial state. kr.dataKeys[dk.KeyId] = dk // Forward the iterator. dk, err = itr.next() } // We have read all the keys, so ignore the EOF error. if err == io.EOF { err = nil } return kr, err } /* Structure of Key Registry.
+-------------------+---------------------+--------------------+--------------+------------------+
| IV                | Sanity Text         | DataKey1           | DataKey2     | ...              |
+-------------------+---------------------+--------------------+--------------+------------------+
*/ // WriteKeyRegistry will rewrite the existing key registry file with a new one. // It is okay to pass a closed key registry, since only its datakeys are used. func WriteKeyRegistry(reg *KeyRegistry, opt KeyRegistryOptions) error { buf := &bytes.Buffer{} iv, err := y.GenerateIV() y.Check(err) // Encrypt the sanity text if the encryption key is present.
eSanity := sanityText if len(opt.EncryptionKey) > 0 { var err error eSanity, err = y.XORBlock(eSanity, opt.EncryptionKey, iv) if err != nil { return y.Wrapf(err, "Error while encrypting sanity text in WriteKeyRegistry") } } y.Check2(buf.Write(iv)) y.Check2(buf.Write(eSanity)) // Write all the datakeys to the buf. for _, k := range reg.dataKeys { // Writing the datakey to the given buffer. if err := storeDataKey(buf, opt.EncryptionKey, k); err != nil { return y.Wrapf(err, "Error while storing datakey in WriteKeyRegistry") } } tmpPath := filepath.Join(opt.Dir, KeyRegistryRewriteFileName) // Open temporary file to write the data and do atomic rename. fp, err := y.OpenTruncFile(tmpPath, true) if err != nil { return y.Wrapf(err, "Error while opening tmp file in WriteKeyRegistry") } // Write buf to the disk. if _, err = fp.Write(buf.Bytes()); err != nil { // close the fd before returning error. We're not using defer // because, for windows we need to close the fd explicitly before // renaming. fp.Close() return y.Wrapf(err, "Error while writing buf in WriteKeyRegistry") } // In Windows the files should be closed before doing a Rename. if err = fp.Close(); err != nil { return y.Wrapf(err, "Error while closing tmp file in WriteKeyRegistry") } // Rename to the original file. if err = os.Rename(tmpPath, filepath.Join(opt.Dir, KeyRegistryFileName)); err != nil { return y.Wrapf(err, "Error while renaming file in WriteKeyRegistry") } // Sync Dir. return syncDir(opt.Dir) } // dataKey returns the datakey of the given key id. func (kr *KeyRegistry) dataKey(id uint64) (*pb.DataKey, error) { kr.RLock() defer kr.RUnlock() if id == 0 { // nil represents plain text. return nil, nil } dk, ok := kr.dataKeys[id] if !ok { return nil, y.Wrapf(ErrInvalidDataKeyID, "Error for the KEY ID %d", id) } return dk, nil } // latestDataKey will give you the latest generated datakey based on the rotation // period. If the last generated datakey's lifetime exceeds the rotation period, // it'll create a new datakey. func (kr *KeyRegistry) latestDataKey() (*pb.DataKey, error) { if len(kr.opt.EncryptionKey) == 0 { // nil is for no encryption. return nil, nil } // validKey returns the datakey if the time since the last generated key is less // than the rotation duration. validKey := func() (*pb.DataKey, bool) { // Time difference from the last generated time. diff := time.Since(time.Unix(kr.lastCreated, 0)) if diff < kr.opt.EncryptionKeyRotationDuration { return kr.dataKeys[kr.nextKeyID], true } return nil, false } kr.RLock() key, valid := validKey() kr.RUnlock() if valid { // If less than EncryptionKeyRotationDuration, returns the last generated key. return key, nil } kr.Lock() defer kr.Unlock() // The key might have been generated by another goroutine. So, // check once again. key, valid = validKey() if valid { return key, nil } k := make([]byte, len(kr.opt.EncryptionKey)) iv, err := y.GenerateIV() if err != nil { return nil, err } _, err = rand.Read(k) if err != nil { return nil, err } // Otherwise, increment the KeyID and generate a new datakey. kr.nextKeyID++ dk := &pb.DataKey{ KeyId: kr.nextKeyID, Data: k, CreatedAt: time.Now().Unix(), Iv: iv, } // Don't store the datakey on file if badger is running in InMemory mode. if !kr.opt.InMemory { // Store the datakey. buf := &bytes.Buffer{} if err = storeDataKey(buf, kr.opt.EncryptionKey, dk); err != nil { return nil, err } // Persist the datakey to the disk. if _, err = kr.fp.Write(buf.Bytes()); err != nil { return nil, err } } // storeDataKey encrypts the datakey in place, so put the un-encrypted key back in memory.
dk.Data = k kr.lastCreated = dk.CreatedAt kr.dataKeys[kr.nextKeyID] = dk return dk, nil } // Close closes the key registry. func (kr *KeyRegistry) Close() error { if !(kr.opt.ReadOnly || kr.opt.InMemory) { return kr.fp.Close() } return nil } // storeDataKey stores the datakey in an encrypted format in the given buffer, if a storage key is present. func storeDataKey(buf *bytes.Buffer, storageKey []byte, k *pb.DataKey) error { // xor will encrypt the IV and xor with the given data. // It'll be used for both encryption and decryption. xor := func() error { if len(storageKey) == 0 { return nil } var err error k.Data, err = y.XORBlock(k.Data, storageKey, k.Iv) return err } // The in-memory datakey is plain text, so encrypt it before storing to the disk. var err error if err = xor(); err != nil { return y.Wrapf(err, "Error while encrypting datakey in storeDataKey") } var data []byte if data, err = k.Marshal(); err != nil { err = y.Wrapf(err, "Error while marshaling datakey in storeDataKey") var err2 error // decrypting the datakey back. if err2 = xor(); err2 != nil { return y.Wrapf(err, y.Wrapf(err2, "Error while decrypting datakey in storeDataKey").Error()) } return err } var lenCrcBuf [8]byte binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(data))) binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(data, y.CastagnoliCrcTable)) y.Check2(buf.Write(lenCrcBuf[:])) y.Check2(buf.Write(data)) // Decrypting the datakey back since we're using the pointer. return xor() } badger-2.2007.2/key_registry_test.go000066400000000000000000000112701372173116500173050ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "io/ioutil" "math/rand" "testing" "github.com/stretchr/testify/require" ) func getRegistryTestOptions(dir string, key []byte) KeyRegistryOptions { return KeyRegistryOptions{ Dir: dir, EncryptionKey: key, ReadOnly: false, } } func TestBuildRegistry(t *testing.T) { encryptionKey := make([]byte, 32) dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) _, err = rand.Read(encryptionKey) require.NoError(t, err) opt := getRegistryTestOptions(dir, encryptionKey) kr, err := OpenKeyRegistry(opt) require.NoError(t, err) dk, err := kr.latestDataKey() require.NoError(t, err) // We're resetting the last created timestamp. So, it creates a // new datakey. kr.lastCreated = 0 dk1, err := kr.latestDataKey() // We generated two keys. So, check the length. require.Equal(t, 2, len(kr.dataKeys)) require.NoError(t, err) require.NoError(t, kr.Close()) kr2, err := OpenKeyRegistry(opt) require.NoError(t, err) require.Equal(t, 2, len(kr2.dataKeys)) // Asserting the correctness of the datakey after opening the registry.
require.Equal(t, dk.Data, kr.dataKeys[dk.KeyId].Data) require.Equal(t, dk1.Data, kr.dataKeys[dk1.KeyId].Data) require.NoError(t, kr2.Close()) } func TestRewriteRegistry(t *testing.T) { encryptionKey := make([]byte, 32) dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) _, err = rand.Read(encryptionKey) require.NoError(t, err) opt := getRegistryTestOptions(dir, encryptionKey) kr, err := OpenKeyRegistry(opt) require.NoError(t, err) _, err = kr.latestDataKey() require.NoError(t, err) // We're resetting the last created timestamp. So, it creates // new datakey. kr.lastCreated = 0 _, err = kr.latestDataKey() require.NoError(t, err) require.NoError(t, kr.Close()) delete(kr.dataKeys, 1) require.NoError(t, WriteKeyRegistry(kr, opt)) kr2, err := OpenKeyRegistry(opt) require.NoError(t, err) require.Equal(t, 1, len(kr2.dataKeys)) require.NoError(t, kr2.Close()) } func TestMismatch(t *testing.T) { encryptionKey := make([]byte, 32) dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) _, err = rand.Read(encryptionKey) require.NoError(t, err) opt := getRegistryTestOptions(dir, encryptionKey) kr, err := OpenKeyRegistry(opt) require.NoError(t, err) require.NoError(t, kr.Close()) // Opening with the same key and asserting. kr, err = OpenKeyRegistry(opt) require.NoError(t, err) require.NoError(t, kr.Close()) // Opening with the invalid key and asserting. encryptionKey = make([]byte, 32) _, err = rand.Read(encryptionKey) require.NoError(t, err) opt.EncryptionKey = encryptionKey _, err = OpenKeyRegistry(opt) require.Error(t, err) require.EqualError(t, err, ErrEncryptionKeyMismatch.Error()) } func TestEncryptionAndDecryption(t *testing.T) { encryptionKey := make([]byte, 32) dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) _, err = rand.Read(encryptionKey) require.NoError(t, err) opt := getRegistryTestOptions(dir, encryptionKey) kr, err := OpenKeyRegistry(opt) require.NoError(t, err) dk, err := kr.latestDataKey() require.NoError(t, err) require.NoError(t, kr.Close()) // Checking the correctness of the datakey after closing and // opening the key registry. kr, err = OpenKeyRegistry(opt) require.NoError(t, err) dk1, err := kr.dataKey(dk.GetKeyId()) require.NoError(t, err) require.Equal(t, dk.Data, dk1.Data) require.NoError(t, kr.Close()) } func TestKeyRegistryInMemory(t *testing.T) { encryptionKey := make([]byte, 32) _, err := rand.Read(encryptionKey) require.NoError(t, err) opt := getRegistryTestOptions("", encryptionKey) opt.InMemory = true kr, err := OpenKeyRegistry(opt) require.NoError(t, err) _, err = kr.latestDataKey() require.NoError(t, err) // We're resetting the last created timestamp. So, it creates // new datakey. kr.lastCreated = 0 _, err = kr.latestDataKey() // We generated two key. So, checking the length. require.Equal(t, 2, len(kr.dataKeys)) require.NoError(t, err) require.NoError(t, kr.Close()) } badger-2.2007.2/level_handler.go000066400000000000000000000222761372173116500163420ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "fmt" "sort" "sync" "github.com/dgryski/go-farm" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" ) type levelHandler struct { // Guards tables, totalSize. sync.RWMutex // For level >= 1, tables are sorted by key ranges, which do not overlap. // For level 0, tables are sorted by time. // For level 0, newest table are at the back. Compact the oldest one first, which is at the front. tables []*table.Table totalSize int64 // The following are initialized once and const. level int strLevel string maxTotalSize int64 db *DB } func (s *levelHandler) getTotalSize() int64 { s.RLock() defer s.RUnlock() return s.totalSize } // initTables replaces s.tables with given tables. This is done during loading. func (s *levelHandler) initTables(tables []*table.Table) { s.Lock() defer s.Unlock() s.tables = tables s.totalSize = 0 for _, t := range tables { s.totalSize += t.Size() } if s.level == 0 { // Key range will overlap. Just sort by fileID in ascending order // because newer tables are at the end of level 0. sort.Slice(s.tables, func(i, j int) bool { return s.tables[i].ID() < s.tables[j].ID() }) } else { // Sort tables by keys. sort.Slice(s.tables, func(i, j int) bool { return y.CompareKeys(s.tables[i].Smallest(), s.tables[j].Smallest()) < 0 }) } } // deleteTables remove tables idx0, ..., idx1-1. func (s *levelHandler) deleteTables(toDel []*table.Table) error { s.Lock() // s.Unlock() below toDelMap := make(map[uint64]struct{}) for _, t := range toDel { toDelMap[t.ID()] = struct{}{} } // Make a copy as iterators might be keeping a slice of tables. var newTables []*table.Table for _, t := range s.tables { _, found := toDelMap[t.ID()] if !found { newTables = append(newTables, t) continue } s.totalSize -= t.Size() } s.tables = newTables s.Unlock() // Unlock s _before_ we DecrRef our tables, which can be slow. return decrRefs(toDel) } // replaceTables will replace tables[left:right] with newTables. Note this EXCLUDES tables[right]. // You must call decr() to delete the old tables _after_ writing the update to the manifest. func (s *levelHandler) replaceTables(toDel, toAdd []*table.Table) error { // Need to re-search the range of tables in this level to be replaced as other goroutines might // be changing it as well. (They can't touch our tables, but if they add/remove other tables, // the indices get shifted around.) s.Lock() // We s.Unlock() below. toDelMap := make(map[uint64]struct{}) for _, t := range toDel { toDelMap[t.ID()] = struct{}{} } var newTables []*table.Table for _, t := range s.tables { _, found := toDelMap[t.ID()] if !found { newTables = append(newTables, t) continue } s.totalSize -= t.Size() } // Increase totalSize first. for _, t := range toAdd { s.totalSize += t.Size() t.IncrRef() newTables = append(newTables, t) } // Assign tables. s.tables = newTables sort.Slice(s.tables, func(i, j int) bool { return y.CompareKeys(s.tables[i].Smallest(), s.tables[j].Smallest()) < 0 }) s.Unlock() // s.Unlock before we DecrRef tables -- that can be slow. return decrRefs(toDel) } // addTable adds toAdd table to levelHandler. 
Normally when we add tables to levelHandler, we sort // tables based on table.Smallest. This is required for correctness of the system. But in the case of // stream writer this can be avoided. We can just add tables to levelHandler's table list // and after all addTable calls, we can sort the table list (see the sortTables method). // NOTE: levelHandler.sortTables() should be called after all addTable calls are done. func (s *levelHandler) addTable(t *table.Table) { s.Lock() defer s.Unlock() s.totalSize += t.Size() // Increase totalSize first. t.IncrRef() s.tables = append(s.tables, t) } // sortTables sorts tables of levelHandler based on table.Smallest. // Normally it should be called after all addTable calls. func (s *levelHandler) sortTables() { s.RLock() defer s.RUnlock() sort.Slice(s.tables, func(i, j int) bool { return y.CompareKeys(s.tables[i].Smallest(), s.tables[j].Smallest()) < 0 }) } func decrRefs(tables []*table.Table) error { for _, table := range tables { if err := table.DecrRef(); err != nil { return err } } return nil } func newLevelHandler(db *DB, level int) *levelHandler { return &levelHandler{ level: level, strLevel: fmt.Sprintf("l%d", level), db: db, } } // tryAddLevel0Table returns true if ok and no stalling. func (s *levelHandler) tryAddLevel0Table(t *table.Table) bool { y.AssertTrue(s.level == 0) // Need lock as we may be deleting the first table during a level 0 compaction. s.Lock() defer s.Unlock() // Stall (by returning false) if we are above the specified stall setting for L0. if len(s.tables) >= s.db.opt.NumLevelZeroTablesStall { return false } s.tables = append(s.tables, t) t.IncrRef() s.totalSize += t.Size() return true } func (s *levelHandler) numTables() int { s.RLock() defer s.RUnlock() return len(s.tables) } func (s *levelHandler) close() error { s.RLock() defer s.RUnlock() var err error for _, t := range s.tables { if closeErr := t.Close(); closeErr != nil && err == nil { err = closeErr } } return errors.Wrap(err, "levelHandler.close") } // getTableForKey acquires a read-lock to access s.tables. It returns a list of tableHandlers. func (s *levelHandler) getTableForKey(key []byte) ([]*table.Table, func() error) { s.RLock() defer s.RUnlock() if s.level == 0 { // For level 0, we need to check every table. Remember to make a copy as s.tables may change // once we exit this function, and we don't want to lock s.tables while seeking in tables. // CAUTION: Reverse the tables. out := make([]*table.Table, 0, len(s.tables)) for i := len(s.tables) - 1; i >= 0; i-- { out = append(out, s.tables[i]) s.tables[i].IncrRef() } return out, func() error { for _, t := range out { if err := t.DecrRef(); err != nil { return err } } return nil } } // For level >= 1, we can do a binary search as key range does not overlap. idx := sort.Search(len(s.tables), func(i int) bool { return y.CompareKeys(s.tables[i].Biggest(), key) >= 0 }) if idx >= len(s.tables) { // Given key is strictly > than every element we have. return nil, func() error { return nil } } tbl := s.tables[idx] tbl.IncrRef() return []*table.Table{tbl}, tbl.DecrRef } // get returns value for a given key or the key after that. If not found, return nil.
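// get fans out to every candidate table returned by getTableForKey (all L0
// tables, or a single binary-searched table on levels >= 1) and keeps the
// entry with the highest version it sees; the decr callback then releases the
// table references that getTableForKey took via IncrRef.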
func (s *levelHandler) get(key []byte) (y.ValueStruct, error) { tables, decr := s.getTableForKey(key) keyNoTs := y.ParseKey(key) hash := farm.Fingerprint64(keyNoTs) var maxVs y.ValueStruct for _, th := range tables { if th.DoesNotHave(hash) { y.NumLSMBloomHits.Add(s.strLevel, 1) continue } it := th.NewIterator(false) defer it.Close() y.NumLSMGets.Add(s.strLevel, 1) it.Seek(key) if !it.Valid() { continue } if y.SameKey(key, it.Key()) { if version := y.ParseTs(it.Key()); maxVs.Version < version { maxVs = it.ValueCopy() maxVs.Version = version } } } return maxVs, decr() } // appendIterators appends iterators to an array of iterators, for merging. // Note: This obtains references for the table handlers. Remember to close these iterators. func (s *levelHandler) appendIterators(iters []y.Iterator, opt *IteratorOptions) []y.Iterator { s.RLock() defer s.RUnlock() if s.level == 0 { // Remember to add in reverse order! // The newer table at the end of s.tables should be added first as it takes precedence. // Level 0 tables are not in key sorted order, so we need to consider them one by one. var out []*table.Table for _, t := range s.tables { if opt.pickTable(t) { out = append(out, t) } } return appendIteratorsReversed(iters, out, opt.Reverse) } tables := opt.pickTables(s.tables) if len(tables) == 0 { return iters } return append(iters, table.NewConcatIterator(tables, opt.Reverse)) } type levelHandlerRLocked struct{} // overlappingTables returns the tables that intersect with key range. Returns a half-interval. // This function should already have acquired a read lock, and this is so important the caller must // pass an empty parameter declaring such. func (s *levelHandler) overlappingTables(_ levelHandlerRLocked, kr keyRange) (int, int) { if len(kr.left) == 0 || len(kr.right) == 0 { return 0, 0 } left := sort.Search(len(s.tables), func(i int) bool { return y.CompareKeys(kr.left, s.tables[i].Biggest()) <= 0 }) right := sort.Search(len(s.tables), func(i int) bool { return y.CompareKeys(kr.right, s.tables[i].Smallest()) < 0 }) return left, right } badger-2.2007.2/levels.go000066400000000000000000001056541372173116500150320ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "fmt" "math/rand" "os" "sort" "strings" "sync" "sync/atomic" "time" "golang.org/x/net/trace" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" ) type levelsController struct { nextFileID uint64 // Atomic // The following are initialized once and const. levels []*levelHandler kv *DB cstatus compactStatus // This is for getting timings between stalls. lastUnstalled time.Time } // revertToManifest checks that all necessary table files exist and removes all table files not // referenced by the manifest. idMap is a set of table file id's that were read from the directory // listing. 
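// A table referenced by the manifest but missing on disk is unrecoverable, so
// the open is aborted. The reverse case, an .sst present on disk but absent
// from the manifest, is expected after a crash between file creation and the
// manifest update, and such orphan files are safely deleted below.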
func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error { // 1. Check all files in manifest exist. for id := range mf.Tables { if _, ok := idMap[id]; !ok { return fmt.Errorf("file does not exist for table %d", id) } } // 2. Delete files that shouldn't exist. for id := range idMap { if _, ok := mf.Tables[id]; !ok { kv.opt.Debugf("Table file %d not referenced in MANIFEST\n", id) filename := table.NewFilename(id, kv.opt.Dir) if err := os.Remove(filename); err != nil { return y.Wrapf(err, "While removing table %d", id) } } } return nil } func newLevelsController(db *DB, mf *Manifest) (*levelsController, error) { y.AssertTrue(db.opt.NumLevelZeroTablesStall > db.opt.NumLevelZeroTables) s := &levelsController{ kv: db, levels: make([]*levelHandler, db.opt.MaxLevels), } s.cstatus.levels = make([]*levelCompactStatus, db.opt.MaxLevels) for i := 0; i < db.opt.MaxLevels; i++ { s.levels[i] = newLevelHandler(db, i) switch i { case 0: // Do nothing. case 1: // Level 1 probably shouldn't be too much bigger than level 0. s.levels[i].maxTotalSize = db.opt.LevelOneSize default: s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(db.opt.LevelSizeMultiplier) } s.cstatus.levels[i] = new(levelCompactStatus) } if db.opt.InMemory { return s, nil } // Compare manifest against directory, check for existent/non-existent files, and remove. if err := revertToManifest(db, mf, getIDMap(db.opt.Dir)); err != nil { return nil, err } // Some files may be deleted. Let's reload. var flags uint32 = y.Sync if db.opt.ReadOnly { flags |= y.ReadOnly } var mu sync.Mutex tables := make([][]*table.Table, db.opt.MaxLevels) var maxFileID uint64 // We found that using 3 goroutines allows disk throughput to be utilized to its max. // Disk utilization is the main thing we should focus on, while trying to read the data. That's // the one factor that remains constant between HDD and SSD. throttle := y.NewThrottle(3) start := time.Now() var numOpened int32 tick := time.NewTicker(3 * time.Second) defer tick.Stop() for fileID, tf := range mf.Tables { fname := table.NewFilename(fileID, db.opt.Dir) select { case <-tick.C: db.opt.Infof("%d tables out of %d opened in %s\n", atomic.LoadInt32(&numOpened), len(mf.Tables), time.Since(start).Round(time.Millisecond)) default: } if err := throttle.Do(); err != nil { closeAllTables(tables) return nil, err } if fileID > maxFileID { maxFileID = fileID } go func(fname string, tf TableManifest) { var rerr error defer func() { throttle.Done(rerr) atomic.AddInt32(&numOpened, 1) }() fd, err := y.OpenExistingFile(fname, flags) if err != nil { rerr = errors.Wrapf(err, "Opening file: %q", fname) return } dk, err := db.registry.dataKey(tf.KeyID) if err != nil { rerr = errors.Wrapf(err, "Error while reading datakey") return } topt := buildTableOptions(db.opt) // Set compression from table manifest. topt.Compression = tf.Compression topt.DataKey = dk topt.BlockCache = db.blockCache topt.IndexCache = db.indexCache t, err := table.OpenTable(fd, topt) if err != nil { if strings.HasPrefix(err.Error(), "CHECKSUM_MISMATCH:") { db.opt.Errorf(err.Error()) db.opt.Errorf("Ignoring table %s", fd.Name()) // Do not set rerr. We will continue without this table. 
} else { rerr = errors.Wrapf(err, "Opening table: %q", fname) } return } mu.Lock() tables[tf.Level] = append(tables[tf.Level], t) mu.Unlock() }(fname, tf) } if err := throttle.Finish(); err != nil { closeAllTables(tables) return nil, err } db.opt.Infof("All %d tables opened in %s\n", atomic.LoadInt32(&numOpened), time.Since(start).Round(time.Millisecond)) s.nextFileID = maxFileID + 1 for i, tbls := range tables { s.levels[i].initTables(tbls) } // Make sure key ranges do not overlap etc. if err := s.validate(); err != nil { _ = s.cleanupLevels() return nil, errors.Wrap(err, "Level validation") } // Sync directory (because we have at least removed some files, or previously created the // manifest file). if err := syncDir(db.opt.Dir); err != nil { _ = s.close() return nil, err } return s, nil } // Closes the tables, for cleanup in newLevelsController. (We Close() instead of using DecrRef() // because that would delete the underlying files.) We ignore errors, which is OK because tables // are read-only. func closeAllTables(tables [][]*table.Table) { for _, tableSlice := range tables { for _, table := range tableSlice { _ = table.Close() } } } func (s *levelsController) cleanupLevels() error { var firstErr error for _, l := range s.levels { if err := l.close(); err != nil && firstErr == nil { firstErr = err } } return firstErr } // dropTree picks all tables from all levels, creates a manifest changeset, // applies it, and then decrements the refs of these tables, which would result // in their deletion. func (s *levelsController) dropTree() (int, error) { // First pick all tables, so we can create a manifest changelog. var all []*table.Table for _, l := range s.levels { l.RLock() all = append(all, l.tables...) l.RUnlock() } if len(all) == 0 { return 0, nil } // Generate the manifest changes. changes := []*pb.ManifestChange{} for _, table := range all { // Add a delete change only if the table is not in memory. if !table.IsInmemory { changes = append(changes, newDeleteChange(table.ID())) } } changeSet := pb.ManifestChangeSet{Changes: changes} if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil { return 0, err } // Now that manifest has been successfully written, we can delete the tables. for _, l := range s.levels { l.Lock() l.totalSize = 0 l.tables = l.tables[:0] l.Unlock() } for _, table := range all { if err := table.DecrRef(); err != nil { return 0, err } } return len(all), nil } // dropPrefix runs a L0->L1 compaction, and then runs same level compaction on the rest of the // levels. For L0->L1 compaction, it runs compactions normally, but skips over all the keys with the // provided prefix and also the internal move keys for the same prefix. // For Li->Li compactions, it picks up the tables which would have the prefix. The // tables who only have keys with this prefix are quickly dropped. The ones which have other keys // are run through MergeIterator and compacted to create new tables. All the mechanisms of // compactions apply, i.e. level sizes and MANIFEST are updated as in the normal flow. func (s *levelsController) dropPrefixes(prefixes [][]byte) error { // Internal move keys related to the given prefix should also be skipped. for _, prefix := range prefixes { key := make([]byte, 0, len(badgerMove)+len(prefix)) key = append(key, badgerMove...) key = append(key, prefix...) 
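		// Appending to prefixes while ranging over it is safe here: the range
		// expression is evaluated once up front, so only the original
		// user-supplied prefixes are visited and each gains exactly one
		// badgerMove twin.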
prefixes = append(prefixes, key) } opt := s.kv.opt // Iterate levels in the reverse order because if we were to iterate from // lower level (say level 0) to a higher level (say level 3) we could have // a state in which level 0 is compacted and an older version of a key exists in lower level. // At this point, if someone creates an iterator, they would see an old // value for a key from lower levels. Iterating in reverse order ensures we // drop the oldest data first so that lookups never return stale data. for i := len(s.levels) - 1; i >= 0; i-- { l := s.levels[i] l.RLock() if l.level == 0 { size := len(l.tables) l.RUnlock() if size > 0 { cp := compactionPriority{ level: 0, score: 1.74, // A unique number greater than 1.0 does two things. Helps identify this // function in logs, and forces a compaction. dropPrefixes: prefixes, } if err := s.doCompact(174, cp); err != nil { opt.Warningf("While compacting level 0: %v", err) return nil } } continue } // Build a list of compaction tableGroups affecting all the prefixes we // need to drop. We need to build tableGroups that satisfy the invariant that // bottom tables are consecutive. // tableGroup contains groups of consecutive tables. var tableGroups [][]*table.Table var tableGroup []*table.Table finishGroup := func() { if len(tableGroup) > 0 { tableGroups = append(tableGroups, tableGroup) tableGroup = nil } } for _, table := range l.tables { if containsAnyPrefixes(table.Smallest(), table.Biggest(), prefixes) { tableGroup = append(tableGroup, table) } else { finishGroup() } } finishGroup() l.RUnlock() if len(tableGroups) == 0 { continue } opt.Infof("Dropping prefix at level %d (%d tableGroups)", l.level, len(tableGroups)) for _, operation := range tableGroups { cd := compactDef{ elog: trace.New(fmt.Sprintf("Badger.L%d", l.level), "Compact"), thisLevel: l, nextLevel: l, top: nil, bot: operation, dropPrefixes: prefixes, } if err := s.runCompactDef(l.level, cd); err != nil { opt.Warningf("While running compact def: %+v. Error: %v", cd, err) return err } } } return nil } func (s *levelsController) startCompact(lc *y.Closer) { n := s.kv.opt.NumCompactors lc.AddRunning(n - 1) for i := 0; i < n; i++ { // The worker with id=0 is dedicated to L0 and L1. This is not counted // towards the user specified NumCompactors. go s.runCompactor(i, lc) } } func (s *levelsController) runCompactor(id int, lc *y.Closer) { defer lc.Done() randomDelay := time.NewTimer(time.Duration(rand.Int31n(1000)) * time.Millisecond) select { case <-randomDelay.C: case <-lc.HasBeenClosed(): randomDelay.Stop() return } ticker := time.NewTicker(100 * time.Millisecond) defer ticker.Stop() for { select { // Can add a done channel or other stuff. case <-ticker.C: prios := s.pickCompactLevels() loop: for _, p := range prios { if id == 0 && p.level > 1 { // If I'm ID zero, I only compact L0 and L1. continue } if id != 0 && p.level <= 1 { // If I'm ID non-zero, I do NOT compact L0 and L1. continue } err := s.doCompact(id, p) switch err { case nil: break loop case errFillTables: // pass default: s.kv.opt.Warningf("While running doCompact: %v\n", err) } } case <-lc.HasBeenClosed(): return } } } // Returns true if level zero may be compacted, without accounting for compactions that already // might be happening. func (s *levelsController) isLevel0Compactable() bool { return s.levels[0].numTables() >= s.kv.opt.NumLevelZeroTables } // Returns true if the non-zero level may be compacted. 
delSize provides the size of the tables // which are currently being compacted so that we treat them as already having started being // compacted (because they have been, yet their size is already counted in getTotalSize). func (l *levelHandler) isCompactable(delSize int64) bool { return l.getTotalSize()-delSize >= l.maxTotalSize } type compactionPriority struct { level int score float64 dropPrefixes [][]byte } // pickCompactLevel determines which level to compact. // Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction func (s *levelsController) pickCompactLevels() (prios []compactionPriority) { // This function must use identical criteria for guaranteeing compaction's progress that // addLevel0Table uses. // cstatus is checked to see if level 0's tables are already being compacted if !s.cstatus.overlapsWith(0, infRange) && s.isLevel0Compactable() { pri := compactionPriority{ level: 0, score: float64(s.levels[0].numTables()) / float64(s.kv.opt.NumLevelZeroTables), } prios = append(prios, pri) } for i, l := range s.levels[1:] { // Don't consider those tables that are already being compacted right now. delSize := s.cstatus.delSize(i + 1) if l.isCompactable(delSize) { pri := compactionPriority{ level: i + 1, score: float64(l.getTotalSize()-delSize) / float64(l.maxTotalSize), } prios = append(prios, pri) } } // We should continue to sort the compaction priorities by score. Now that we have a dedicated // compactor for L0 and L1, we don't need to sort by level here. sort.Slice(prios, func(i, j int) bool { return prios[i].score > prios[j].score }) return prios } // checkOverlap checks if the given tables overlap with any level from the given "lev" onwards. func (s *levelsController) checkOverlap(tables []*table.Table, lev int) bool { kr := getKeyRange(tables...) for i, lh := range s.levels { if i < lev { // Skip upper levels. continue } lh.RLock() left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) lh.RUnlock() if right-left > 0 { return true } } return false } // compactBuildTables merges topTables and botTables to form a list of new tables. func (s *levelsController) compactBuildTables( lev int, cd compactDef) ([]*table.Table, func() error, error) { topTables := cd.top botTables := cd.bot // Check overlap of the top level with the levels which are not being // compacted in this compaction. hasOverlap := s.checkOverlap(cd.allTables(), cd.nextLevel.level+1) // Try to collect stats so that we can inform value log about GC. That would help us find which // value log file should be GCed. discardStats := make(map[uint32]int64) updateStats := func(vs y.ValueStruct) { // We don't need to store/update discard stats when badger is running in Disk-less mode. if s.kv.opt.InMemory { return } if vs.Meta&bitValuePointer > 0 { var vp valuePointer vp.Decode(vs.Value) discardStats[vp.Fid] += int64(vp.Len) } } // Create iterators across all the tables involved first. var iters []y.Iterator switch { case lev == 0: iters = appendIteratorsReversed(iters, topTables, false) case len(topTables) > 0: y.AssertTrue(len(topTables) == 1) iters = []y.Iterator{topTables[0].NewIterator(false)} } // Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap. var valid []*table.Table nextTable: for _, table := range botTables { if len(cd.dropPrefixes) > 0 { for _, prefix := range cd.dropPrefixes { if bytes.HasPrefix(table.Smallest(), prefix) && bytes.HasPrefix(table.Biggest(), prefix) { // All the keys in this table have the dropPrefix. 
So, this // table does not need to be in the iterator and can be // dropped immediately. continue nextTable } } } valid = append(valid, table) } iters = append(iters, table.NewConcatIterator(valid, false)) it := table.NewMergeIterator(iters, false) defer it.Close() // Important to close the iterator to do ref counting. it.Rewind() // Pick a discard ts, so we can discard versions below this ts. We should // never discard any versions starting from above this timestamp, because // that would affect the snapshot view guarantee provided by transactions. discardTs := s.kv.orc.discardAtOrBelow() var numBuilds, numVersions int var lastKey, skipKey []byte var vp valuePointer var newTables []*table.Table mu := new(sync.Mutex) // Guards newTables inflightBuilders := y.NewThrottle(5) for it.Valid() { timeStart := time.Now() dk, err := s.kv.registry.latestDataKey() if err != nil { return nil, nil, y.Wrapf(err, "Error while retrieving datakey in levelsController.compactBuildTables") } bopts := buildTableOptions(s.kv.opt) bopts.DataKey = dk // Builder does not need cache but the same options are used for opening table. bopts.BlockCache = s.kv.blockCache bopts.IndexCache = s.kv.indexCache builder := table.NewTableBuilder(bopts) var numKeys, numSkips uint64 for ; it.Valid(); it.Next() { // See if we need to skip the prefix. if len(cd.dropPrefixes) > 0 && hasAnyPrefixes(it.Key(), cd.dropPrefixes) { numSkips++ updateStats(it.Value()) continue } // See if we need to skip this key. if len(skipKey) > 0 { if y.SameKey(it.Key(), skipKey) { numSkips++ updateStats(it.Value()) continue } else { skipKey = skipKey[:0] } } if !y.SameKey(it.Key(), lastKey) { if builder.ReachedCapacity(s.kv.opt.MaxTableSize) { // Only break if we are on a different key, and have reached capacity. We want // to ensure that all versions of the key are stored in the same sstable, and // not divided across multiple tables at the same level. break } lastKey = y.SafeCopy(lastKey, it.Key()) numVersions = 0 } vs := it.Value() version := y.ParseTs(it.Key()) // Do not discard entries inserted by merge operator. These entries will be // discarded once they're merged if version <= discardTs && vs.Meta&bitMergeEntry == 0 { // Keep track of the number of versions encountered for this key. Only consider the // versions which are below the minReadTs, otherwise, we might end up discarding the // only valid version for a running transaction. numVersions++ // Keep the current version and discard all the next versions if // - The `discardEarlierVersions` bit is set OR // - We've already processed `NumVersionsToKeep` number of versions // (including the current item being processed) lastValidVersion := vs.Meta&bitDiscardEarlierVersions > 0 || numVersions == s.kv.opt.NumVersionsToKeep isExpired := isDeletedOrExpired(vs.Meta, vs.ExpiresAt) if isExpired || lastValidVersion { // If this version of the key is deleted or expired, skip all the rest of the // versions. Ensure that we're only removing versions below readTs. skipKey = y.SafeCopy(skipKey, it.Key()) switch { // Add the key to the table only if it has not expired. // We don't want to add the deleted/expired keys. case !isExpired && lastValidVersion: // Add this key. We have set skipKey, so the following key versions // would be skipped. case hasOverlap: // If this key range has overlap with lower levels, then keep the deletion // marker with the latest version, discarding the rest. We have set skipKey, // so the following key versions would be skipped. 
default: // If no overlap, we can skip all the versions, by continuing here. numSkips++ updateStats(vs) continue // Skip adding this key. } } } numKeys++ if vs.Meta&bitValuePointer > 0 { vp.Decode(vs.Value) } builder.Add(it.Key(), vs, vp.Len) } // It was true that it.Valid() at least once in the loop above, which means we // called Add() at least once, and builder is not Empty(). s.kv.opt.Debugf("LOG Compact. Added %d keys. Skipped %d keys. Iteration took: %v", numKeys, numSkips, time.Since(timeStart)) if builder.Empty() { continue } numBuilds++ fileID := s.reserveFileID() if err := inflightBuilders.Do(); err != nil { // Can't return from here, until I decrRef all the tables that I built so far. break } go func(builder *table.Builder) { defer builder.Close() defer inflightBuilders.Done(err) build := func(fileID uint64) (*table.Table, error) { fd, err := y.CreateSyncedFile(table.NewFilename(fileID, s.kv.opt.Dir), true) if err != nil { return nil, errors.Wrapf(err, "While opening new table: %d", fileID) } if _, err := fd.Write(builder.Finish()); err != nil { return nil, errors.Wrapf(err, "Unable to write to file: %d", fileID) } tbl, err := table.OpenTable(fd, bopts) // decrRef is added below. return tbl, errors.Wrapf(err, "Unable to open table: %q", fd.Name()) } var tbl *table.Table var err error if s.kv.opt.InMemory { tbl, err = table.OpenInMemoryTable(builder.Finish(), fileID, &bopts) } else { tbl, err = build(fileID) } // If we couldn't build the table, return fast. if err != nil { return } mu.Lock() newTables = append(newTables, tbl) mu.Unlock() }(builder) } // Wait for all table builders to finish and also for newTables accumulator to finish. err := inflightBuilders.Finish() if err == nil { // Ensure created files' directory entries are visible. We don't mind the extra latency // from not doing this ASAP after all file creation has finished because this is a // background operation. err = s.kv.syncDir(s.kv.opt.Dir) } if err != nil { // An error happened. Delete all the newly created table files (by calling DecrRef // -- we're the only holders of a ref). _ = decrRefs(newTables) return nil, nil, errors.Wrapf(err, "while running compactions for: %+v", cd) } sort.Slice(newTables, func(i, j int) bool { return y.CompareKeys(newTables[i].Biggest(), newTables[j].Biggest()) < 0 }) s.kv.vlog.updateDiscardStats(discardStats) s.kv.opt.Debugf("Discard stats: %v", discardStats) return newTables, func() error { return decrRefs(newTables) }, nil } func buildChangeSet(cd *compactDef, newTables []*table.Table) pb.ManifestChangeSet { changes := []*pb.ManifestChange{} for _, table := range newTables { changes = append(changes, newCreateChange(table.ID(), cd.nextLevel.level, table.KeyID(), table.CompressionType())) } for _, table := range cd.top { // Add a delete change only if the table is not in memory. 
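// In-memory tables were never recorded in the MANIFEST (see addLevel0Table),
// so there is nothing to delete for them. For a disk-backed compaction with
// hypothetical IDs top={5}, bot={7, 8} and newTables={9, 10}, the resulting
// changeset is: CREATE 9, CREATE 10, DELETE 5, DELETE 7, DELETE 8.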
if !table.IsInmemory { changes = append(changes, newDeleteChange(table.ID())) } } for _, table := range cd.bot { changes = append(changes, newDeleteChange(table.ID())) } return pb.ManifestChangeSet{Changes: changes} } func hasAnyPrefixes(s []byte, listOfPrefixes [][]byte) bool { for _, prefix := range listOfPrefixes { if bytes.HasPrefix(s, prefix) { return true } } return false } func containsPrefix(smallValue, largeValue, prefix []byte) bool { if bytes.HasPrefix(smallValue, prefix) { return true } if bytes.HasPrefix(largeValue, prefix) { return true } if bytes.Compare(prefix, smallValue) > 0 && bytes.Compare(prefix, largeValue) < 0 { return true } return false } func containsAnyPrefixes(smallValue, largeValue []byte, listOfPrefixes [][]byte) bool { for _, prefix := range listOfPrefixes { if containsPrefix(smallValue, largeValue, prefix) { return true } } return false } type compactDef struct { elog trace.Trace thisLevel *levelHandler nextLevel *levelHandler top []*table.Table bot []*table.Table thisRange keyRange nextRange keyRange thisSize int64 dropPrefixes [][]byte } func (cd *compactDef) lockLevels() { cd.thisLevel.RLock() cd.nextLevel.RLock() } func (cd *compactDef) unlockLevels() { cd.nextLevel.RUnlock() cd.thisLevel.RUnlock() } func (cd *compactDef) allTables() []*table.Table { ret := make([]*table.Table, 0, len(cd.top)+len(cd.bot)) ret = append(ret, cd.top...) ret = append(ret, cd.bot...) return ret } func (s *levelsController) fillTablesL0(cd *compactDef) bool { cd.lockLevels() defer cd.unlockLevels() cd.top = make([]*table.Table, len(cd.thisLevel.tables)) copy(cd.top, cd.thisLevel.tables) if len(cd.top) == 0 { return false } cd.thisRange = infRange kr := getKeyRange(cd.top...) left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, kr) cd.bot = make([]*table.Table, right-left) copy(cd.bot, cd.nextLevel.tables[left:right]) if len(cd.bot) == 0 { cd.nextRange = kr } else { cd.nextRange = getKeyRange(cd.bot...) } if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { return false } return true } // sortByOverlap sorts tables in increasing order of overlap with next level. func (s *levelsController) sortByOverlap(tables []*table.Table, cd *compactDef) { if len(tables) == 0 || cd.nextLevel == nil { return } tableOverlap := make([]int, len(tables)) for i := range tables { // get key range for table tableRange := getKeyRange(tables[i]) // get overlap with next level left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, tableRange) tableOverlap[i] = right - left } sort.Slice(tables, func(i, j int) bool { return tableOverlap[i] < tableOverlap[j] }) } func (s *levelsController) fillTables(cd *compactDef) bool { cd.lockLevels() defer cd.unlockLevels() tables := make([]*table.Table, len(cd.thisLevel.tables)) copy(tables, cd.thisLevel.tables) if len(tables) == 0 { return false } // We want to pick files from current level in order of increasing overlap with next level // tables. Idea here is to first compact file from current level which has least overlap with // next level. This provides us better write amplification. s.sortByOverlap(tables, cd) for _, t := range tables { cd.thisSize = t.Size() cd.thisRange = getKeyRange(t) if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) { continue } cd.top = []*table.Table{t} left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) // Sometimes below line(make([]*table.Table, right-left)) panics with error // (runtime error: makeslice: len out of range). 
// One of the reasons for this can be when right < left. We don't know how to
// reproduce it as of now. We are just logging it so that we can get more context.
		if right < left {
			s.kv.opt.Errorf("right: %d is less than left: %d in overlappingTables for current "+
				"level: %d, next level: %d, key range(%s, %s)", right, left,
				cd.thisLevel.level, cd.nextLevel.level, cd.thisRange.left, cd.thisRange.right)
			continue
		}
		cd.bot = make([]*table.Table, right-left)
		copy(cd.bot, cd.nextLevel.tables[left:right])

		if len(cd.bot) == 0 {
			cd.bot = []*table.Table{}
			cd.nextRange = cd.thisRange
			if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
				continue
			}
			return true
		}
		cd.nextRange = getKeyRange(cd.bot...)

		if s.cstatus.overlapsWith(cd.nextLevel.level, cd.nextRange) {
			continue
		}
		if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
			continue
		}
		return true
	}
	return false
}

func (s *levelsController) runCompactDef(l int, cd compactDef) (err error) {
	timeStart := time.Now()

	thisLevel := cd.thisLevel
	nextLevel := cd.nextLevel

	// Tables should never be moved directly between levels; they should always be
	// rewritten, to allow discarding invalid versions.
	newTables, decr, err := s.compactBuildTables(l, cd)
	if err != nil {
		return err
	}
	defer func() {
		// Only assign to err, if it's not already nil.
		if decErr := decr(); err == nil {
			err = decErr
		}
	}()
	changeSet := buildChangeSet(&cd, newTables)

	// We write to the manifest _before_ we delete files (and after we created files).
	if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil {
		return err
	}

	// See comment earlier in this function about the ordering of these ops, and the
	// order in which we access levels when reading.
	if err := nextLevel.replaceTables(cd.bot, newTables); err != nil {
		return err
	}
	if err := thisLevel.deleteTables(cd.top); err != nil {
		return err
	}

	// Note: For level 0, while doCompact is running, it is possible that new tables
	// are added. However, the tables are added only to the end, so it is ok to just
	// delete the first table.
	s.kv.opt.Infof("LOG Compact %d->%d, del %d tables, add %d tables, took %v\n",
		thisLevel.level, nextLevel.level, len(cd.top)+len(cd.bot),
		len(newTables), time.Since(timeStart))
	return nil
}

var errFillTables = errors.New("Unable to fill tables")

// doCompact picks some table on level l and compacts it away to the next level.
func (s *levelsController) doCompact(id int, p compactionPriority) error {
	l := p.level
	y.AssertTrue(l+1 < s.kv.opt.MaxLevels) // Sanity check.

	cd := compactDef{
		elog:         trace.New(fmt.Sprintf("Badger.L%d", l), "Compact"),
		thisLevel:    s.levels[l],
		nextLevel:    s.levels[l+1],
		dropPrefixes: p.dropPrefixes,
	}
	cd.elog.SetMaxEvents(100)
	defer cd.elog.Finish()

	s.kv.opt.Debugf("[Compactor: %d] Attempting to run compaction: %+v", id, p)

	// While picking tables to be compacted, both levels' tables are expected to
	// remain unchanged.
	if l == 0 {
		if !s.fillTablesL0(&cd) {
			return errFillTables
		}
	} else {
		if !s.fillTables(&cd) {
			return errFillTables
		}
	}
	defer s.cstatus.delete(cd) // Remove the ranges from compaction status.

	s.kv.opt.Infof("[Compactor: %d] Running compaction: %+v for level: %d\n",
		id, p, cd.thisLevel.level)
	s.cstatus.toLog(cd.elog)
	if err := s.runCompactDef(l, cd); err != nil {
		// This compaction couldn't be done successfully.
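// Note: errFillTables, returned above when no tables could be picked, never
// reaches this branch; runCompactor treats it as benign and retries on its
// next tick, so only genuine compaction failures are logged below.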
s.kv.opt.Warningf("[Compactor: %d] LOG Compact FAILED with error: %+v: %+v", id, err, cd) return err } s.cstatus.toLog(cd.elog) s.kv.opt.Infof("[Compactor: %d] Compaction for level: %d DONE", id, cd.thisLevel.level) return nil } func (s *levelsController) addLevel0Table(t *table.Table) error { // Add table to manifest file only if it is not opened in memory. We don't want to add a table // to the manifest file if it exists only in memory. if !t.IsInmemory { // We update the manifest _before_ the table becomes part of a levelHandler, because at that // point it could get used in some compaction. This ensures the manifest file gets updated in // the proper order. (That means this update happens before that of some compaction which // deletes the table.) err := s.kv.manifest.addChanges([]*pb.ManifestChange{ newCreateChange(t.ID(), 0, t.KeyID(), t.CompressionType()), }) if err != nil { return err } } for !s.levels[0].tryAddLevel0Table(t) { // Stall. Make sure all levels are healthy before we unstall. var timeStart time.Time { s.kv.opt.Infof("STALLED STALLED STALLED: %v\n", time.Since(s.lastUnstalled)) s.cstatus.RLock() for i := 0; i < s.kv.opt.MaxLevels; i++ { s.kv.opt.Debugf("level=%d. Status=%s Size=%d\n", i, s.cstatus.levels[i].debug(), s.levels[i].getTotalSize()) } s.cstatus.RUnlock() timeStart = time.Now() } // Before we unstall, we need to make sure that level 0 is healthy. Otherwise, we // will very quickly fill up level 0 again. for i := 0; ; i++ { // It's crucial that this behavior replicates pickCompactLevels' behavior in // computing compactability in order to guarantee progress. // Break the loop once L0 has enough space to accommodate new tables. if !s.isLevel0Compactable() { break } time.Sleep(10 * time.Millisecond) if i%100 == 0 { prios := s.pickCompactLevels() s.kv.opt.Debugf("Waiting to add level 0 table. Compaction priorities: %+v\n", prios) i = 0 } } { s.kv.opt.Debugf("UNSTALLED UNSTALLED UNSTALLED: %v\n", time.Since(timeStart)) s.lastUnstalled = time.Now() } } return nil } func (s *levelsController) close() error { err := s.cleanupLevels() return errors.Wrap(err, "levelsController.Close") } // get returns the found value if any. If not found, we return nil. func (s *levelsController) get(key []byte, maxVs *y.ValueStruct, startLevel int) ( y.ValueStruct, error) { if s.kv.IsClosed() { return y.ValueStruct{}, ErrDBClosed } // It's important that we iterate the levels from 0 on upward. The reason is, if we iterated // in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could // read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do // parallelize this, we will need to call the h.RLock() function by increasing order of level // number.) version := y.ParseTs(key) for _, h := range s.levels { // Ignore all levels below startLevel. This is useful for GC when L0 is kept in memory. if h.level < startLevel { continue } vs, err := h.get(key) // Calls h.RLock() and h.RUnlock(). if err != nil { return y.ValueStruct{}, errors.Wrapf(err, "get key: %q", key) } if vs.Value == nil && vs.Meta == 0 { continue } if maxVs == nil || vs.Version == version { return vs, nil } if maxVs.Version < vs.Version { *maxVs = vs } } if maxVs != nil { return *maxVs, nil } return y.ValueStruct{}, nil } func appendIteratorsReversed(out []y.Iterator, th []*table.Table, reversed bool) []y.Iterator { for i := len(th) - 1; i >= 0; i-- { // This will increment the reference of the table handler. 
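// Level 0 tables are ordered oldest to newest, so reversing hands the merge
// iterator the newest tables first, letting it prefer the latest version of a
// key. E.g. tables [t1, t2, t3] (t3 newest) yield iterators [t3, t2, t1].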
out = append(out, th[i].NewIterator(reversed)) } return out } // appendIterators appends iterators to an array of iterators, for merging. // Note: This obtains references for the table handlers. Remember to close these iterators. func (s *levelsController) appendIterators( iters []y.Iterator, opt *IteratorOptions) []y.Iterator { // Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing // data when there's a compaction. for _, level := range s.levels { iters = level.appendIterators(iters, opt) } return iters } // TableInfo represents the information about a table. type TableInfo struct { ID uint64 Level int Left []byte Right []byte KeyCount uint64 // Number of keys in the table EstimatedSz uint64 } func (s *levelsController) getTableInfo(withKeysCount bool) (result []TableInfo) { for _, l := range s.levels { l.RLock() for _, t := range l.tables { var count uint64 if withKeysCount { it := t.NewIterator(false) for it.Rewind(); it.Valid(); it.Next() { count++ } it.Close() } info := TableInfo{ ID: t.ID(), Level: l.level, Left: t.Smallest(), Right: t.Biggest(), KeyCount: count, EstimatedSz: t.EstimatedSize(), } result = append(result, info) } l.RUnlock() } sort.Slice(result, func(i, j int) bool { if result[i].Level != result[j].Level { return result[i].Level < result[j].Level } return result[i].ID < result[j].ID }) return } // verifyChecksum verifies checksum for all tables on all levels. func (s *levelsController) verifyChecksum() error { var tables []*table.Table for _, l := range s.levels { l.RLock() tables = tables[:0] for _, t := range l.tables { tables = append(tables, t) t.IncrRef() } l.RUnlock() for _, t := range tables { errChkVerify := t.VerifyChecksum() if err := t.DecrRef(); err != nil { s.kv.opt.Errorf("unable to decrease reference of table: %s while "+ "verifying checksum with error: %s", t.Filename(), err) } if errChkVerify != nil { return errChkVerify } } } return nil } badger-2.2007.2/levels_test.go000066400000000000000000000674541372173116500160760ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "math" "testing" "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) // createAndOpen creates a table with the given data and adds it to the given level. func createAndOpen(db *DB, td []keyValVersion, level int) { opts := table.Options{ BlockSize: db.opt.BlockSize, BloomFalsePositive: db.opt.BloomFalsePositive, LoadingMode: options.LoadToRAM, ChkMode: options.NoVerification, } b := table.NewTableBuilder(opts) // Add all keys and versions to the table. 
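// (y.KeyWithTs appends the version as the key's trailing 8 bytes, stored
// inverted so that newer versions sort first; y.ParseTs recovers it.)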
for _, item := range td { key := y.KeyWithTs([]byte(item.key), uint64(item.version)) val := y.ValueStruct{Value: []byte(item.val), Meta: item.meta} b.Add(key, val, 0) } fd, err := y.CreateSyncedFile(table.NewFilename(db.lc.reserveFileID(), db.opt.Dir), true) if err != nil { panic(err) } if _, err = fd.Write(b.Finish()); err != nil { panic(err) } tab, err := table.OpenTable(fd, opts) if err != nil { panic(err) } if err := db.manifest.addChanges([]*pb.ManifestChange{ newCreateChange(tab.ID(), level, 0, tab.CompressionType()), }); err != nil { panic(err) } // Add table to the given level. db.lc.levels[level].tables = append(db.lc.levels[level].tables, tab) } type keyValVersion struct { key string val string version int meta byte } func TestCheckOverlap(t *testing.T) { t.Run("overlap", func(t *testing.T) { // This test consists of one table on level 0 and one on level 1. // There is an overlap amongst the tables but there is no overlap // with rest of the levels. t.Run("same keys", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { l0 := []keyValVersion{{"foo", "bar", 3, 0}} l1 := []keyValVersion{{"foo", "bar", 2, 0}} createAndOpen(db, l0, 0) createAndOpen(db, l1, 1) // Level 0 should overlap with level 0 tables. require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 0)) // Level 1 should overlap with level 0 tables (they have the same keys). require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 1)) // Level 2 and 3 should not overlap with level 0 tables. require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 2)) require.False(t, db.lc.checkOverlap(db.lc.levels[1].tables, 2)) require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 3)) require.False(t, db.lc.checkOverlap(db.lc.levels[1].tables, 3)) }) }) t.Run("overlapping keys", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { l0 := []keyValVersion{{"a", "x", 1, 0}, {"b", "x", 1, 0}, {"foo", "bar", 3, 0}} l1 := []keyValVersion{{"foo", "bar", 2, 0}} createAndOpen(db, l0, 0) createAndOpen(db, l1, 1) // Level 0 should overlap with level 0 tables. require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 0)) require.True(t, db.lc.checkOverlap(db.lc.levels[1].tables, 1)) // Level 1 should overlap with level 0 tables, "foo" key is common. require.True(t, db.lc.checkOverlap(db.lc.levels[0].tables, 1)) // Level 2 and 3 should not overlap with level 0 tables. require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 2)) require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 3)) }) }) }) t.Run("non-overlapping", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { l0 := []keyValVersion{{"a", "x", 1, 0}, {"b", "x", 1, 0}, {"c", "bar", 3, 0}} l1 := []keyValVersion{{"foo", "bar", 2, 0}} createAndOpen(db, l0, 0) createAndOpen(db, l1, 1) // Level 1 should not overlap with level 0 tables require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 1)) // Level 2 and 3 should not overlap with level 0 tables. 
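// Note: checkOverlap(tables, lev) compares raw key ranges against every level
// from lev onwards, including the level the tables themselves live on, which
// is why checkOverlap(db.lc.levels[0].tables, 0) is true whenever L0 has tables.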
require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 2)) require.False(t, db.lc.checkOverlap(db.lc.levels[0].tables, 3)) }) }) } func getAllAndCheck(t *testing.T, db *DB, expected []keyValVersion) { db.View(func(txn *Txn) error { opt := DefaultIteratorOptions opt.AllVersions = true opt.InternalAccess = true it := txn.NewIterator(opt) defer it.Close() i := 0 for it.Rewind(); it.Valid(); it.Next() { item := it.Item() v, err := item.ValueCopy(nil) require.NoError(t, err) // fmt.Printf("k: %s v: %d val: %s\n", item.key, item.Version(), v) require.Less(t, i, len(expected), "DB has more number of key than expected") expect := expected[i] require.Equal(t, expect.key, string(item.Key()), "expected key: %s actual key: %s", expect.key, item.Key()) require.Equal(t, expect.val, string(v), "key: %s expected value: %s actual %s", item.key, expect.val, v) require.Equal(t, expect.version, int(item.Version()), "key: %s expected version: %d actual %d", item.key, expect.version, item.Version()) require.Equal(t, expect.meta, item.meta, "key: %s expected meta: %d meta %d", item.key, expect.meta, item.meta) i++ } require.Equal(t, len(expected), i, "keys examined should be equal to keys expected") return nil }) } func TestCompaction(t *testing.T) { // Disable compactions and keep single version of each key. opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(1) opt.managedTxns = true t.Run("level 0 to level 1", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l0 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}} l01 := []keyValVersion{{"foo", "bar", 2, 0}} l1 := []keyValVersion{{"foo", "bar", 1, 0}} // Level 0 has table l0 and l01. createAndOpen(db, l0, 0) createAndOpen(db, l01, 0) // Level 1 has table l1. createAndOpen(db, l1, 1) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 0}, }) cdef := compactDef{ thisLevel: db.lc.levels[0], nextLevel: db.lc.levels[1], top: db.lc.levels[0].tables, bot: db.lc.levels[1].tables, } require.NoError(t, db.lc.runCompactDef(0, cdef)) // foo version 2 should be dropped after compaction. getAllAndCheck(t, db, []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}}) }) }) t.Run("level 0 to level 1 with lower overlap", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l0 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}} l01 := []keyValVersion{{"foo", "bar", 2, 0}} l1 := []keyValVersion{{"foo", "bar", 1, 0}} l2 := []keyValVersion{{"foo", "bar", 0, 0}} // Level 0 has table l0 and l01. createAndOpen(db, l0, 0) createAndOpen(db, l01, 0) // Level 1 has table l1. createAndOpen(db, l1, 1) // Level 2 has table l2. createAndOpen(db, l2, 2) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, {"foo", "bar", 0, 0}, {"fooz", "baz", 1, 0}, }) cdef := compactDef{ thisLevel: db.lc.levels[0], nextLevel: db.lc.levels[1], top: db.lc.levels[0].tables, bot: db.lc.levels[1].tables, } require.NoError(t, db.lc.runCompactDef(0, cdef)) // foo version 2 and version 1 should be dropped after compaction. 
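// foo version 0 lives in level 2, which this L0->L1 compaction does not
// rewrite, so it survives: only the tables in cd.top and cd.bot are replaced.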
getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 0, 0}, {"fooz", "baz", 1, 0}, }) }) }) t.Run("level 1 to level 2", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l1 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}} l2 := []keyValVersion{{"foo", "bar", 2, 0}} createAndOpen(db, l1, 1) createAndOpen(db, l2, 2) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"fooz", "baz", 1, 0}, }) cdef := compactDef{ thisLevel: db.lc.levels[1], nextLevel: db.lc.levels[2], top: db.lc.levels[1].tables, bot: db.lc.levels[2].tables, } require.NoError(t, db.lc.runCompactDef(1, cdef)) // foo version 2 should be dropped after compaction. getAllAndCheck(t, db, []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, 0}}) }) }) t.Run("level 1 to level 2 with delete", func(t *testing.T) { t.Run("with overlap", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l1 := []keyValVersion{{"foo", "bar", 3, bitDelete}, {"fooz", "baz", 1, bitDelete}} l2 := []keyValVersion{{"foo", "bar", 2, 0}} l3 := []keyValVersion{{"foo", "bar", 1, 0}} createAndOpen(db, l1, 1) createAndOpen(db, l2, 2) createAndOpen(db, l3, 3) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 1}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 1}, }) cdef := compactDef{ thisLevel: db.lc.levels[1], nextLevel: db.lc.levels[2], top: db.lc.levels[1].tables, bot: db.lc.levels[2].tables, } require.NoError(t, db.lc.runCompactDef(1, cdef)) // foo bar version 2 should be dropped after compaction. fooz // baz version 1 will remain because overlap exists, which is // expected because `hasOverlap` is only checked once at the // beginning of `compactBuildTables` method. // everything from level 1 is now in level 2. getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, bitDelete}, {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 1}, }) cdef = compactDef{ thisLevel: db.lc.levels[2], nextLevel: db.lc.levels[3], top: db.lc.levels[2].tables, bot: db.lc.levels[3].tables, } require.NoError(t, db.lc.runCompactDef(2, cdef)) // everything should be removed now getAllAndCheck(t, db, []keyValVersion{}) }) }) t.Run("with bottom overlap", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l1 := []keyValVersion{{"foo", "bar", 3, bitDelete}} l2 := []keyValVersion{{"foo", "bar", 2, 0}, {"fooz", "baz", 2, bitDelete}} l3 := []keyValVersion{{"fooz", "baz", 1, 0}} createAndOpen(db, l1, 1) createAndOpen(db, l2, 2) createAndOpen(db, l3, 3) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, bitDelete}, {"foo", "bar", 2, 0}, {"fooz", "baz", 2, bitDelete}, {"fooz", "baz", 1, 0}, }) cdef := compactDef{ thisLevel: db.lc.levels[1], nextLevel: db.lc.levels[2], top: db.lc.levels[1].tables, bot: db.lc.levels[2].tables, } require.NoError(t, db.lc.runCompactDef(1, cdef)) // the top table at L1 doesn't overlap L3, but the bottom table at L2 // does, delete keys should not be removed. 
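// hasOverlap is computed once in compactBuildTables over cd.allTables(), i.e.
// top and bottom together, against the levels below nextLevel; here the L2
// table's "fooz" range overlaps L3, which is what preserves the delete markers.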
getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, bitDelete}, {"fooz", "baz", 2, bitDelete}, {"fooz", "baz", 1, 0}, }) }) }) t.Run("without overlap", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l1 := []keyValVersion{{"foo", "bar", 3, bitDelete}, {"fooz", "baz", 1, bitDelete}} l2 := []keyValVersion{{"fooo", "barr", 2, 0}} createAndOpen(db, l1, 1) createAndOpen(db, l2, 2) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 1}, {"fooo", "barr", 2, 0}, {"fooz", "baz", 1, 1}, }) cdef := compactDef{ thisLevel: db.lc.levels[1], nextLevel: db.lc.levels[2], top: db.lc.levels[1].tables, bot: db.lc.levels[2].tables, } require.NoError(t, db.lc.runCompactDef(1, cdef)) // foo version 2 should be dropped after compaction. getAllAndCheck(t, db, []keyValVersion{{"fooo", "barr", 2, 0}}) }) }) }) } func TestCompactionTwoVersions(t *testing.T) { // Disable compactions and keep two versions of each key. opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(2) opt.managedTxns = true t.Run("with overlap", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l1 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, bitDelete}} l2 := []keyValVersion{{"foo", "bar", 2, 0}} l3 := []keyValVersion{{"foo", "bar", 1, 0}} createAndOpen(db, l1, 1) createAndOpen(db, l2, 2) createAndOpen(db, l3, 3) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 1}, }) cdef := compactDef{ thisLevel: db.lc.levels[1], nextLevel: db.lc.levels[2], top: db.lc.levels[1].tables, bot: db.lc.levels[2].tables, } require.NoError(t, db.lc.runCompactDef(1, cdef)) // Nothing should be dropped after compaction because number of // versions to keep is 2. getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 1}, }) cdef = compactDef{ thisLevel: db.lc.levels[2], nextLevel: db.lc.levels[3], top: db.lc.levels[2].tables, bot: db.lc.levels[3].tables, } require.NoError(t, db.lc.runCompactDef(2, cdef)) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, }) }) }) } func TestCompactionAllVersions(t *testing.T) { // Disable compactions and keep all versions of the each key. opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(math.MaxInt32) opt.managedTxns = true t.Run("without overlap", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l1 := []keyValVersion{{"foo", "bar", 3, 0}, {"fooz", "baz", 1, bitDelete}} l2 := []keyValVersion{{"foo", "bar", 2, 0}} l3 := []keyValVersion{{"foo", "bar", 1, 0}} createAndOpen(db, l1, 1) createAndOpen(db, l2, 2) createAndOpen(db, l3, 3) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 1}, }) cdef := compactDef{ thisLevel: db.lc.levels[1], nextLevel: db.lc.levels[2], top: db.lc.levels[1].tables, bot: db.lc.levels[2].tables, } require.NoError(t, db.lc.runCompactDef(1, cdef)) // Nothing should be dropped after compaction because all versions // should be kept. 
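// With NumVersionsToKeep set to math.MaxInt32, the numVersions counter never
// reaches the limit, so a version can only be dropped when a newer version at
// or below discardTs carries bitDiscardEarlierVersions, or when the key is
// deleted/expired and has no overlap with lower levels.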
getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, {"fooz", "baz", 1, 1}, }) cdef = compactDef{ thisLevel: db.lc.levels[2], nextLevel: db.lc.levels[3], top: db.lc.levels[2].tables, bot: db.lc.levels[3].tables, } require.NoError(t, db.lc.runCompactDef(2, cdef)) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"foo", "bar", 1, 0}, }) }) }) t.Run("without overlap", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l1 := []keyValVersion{{"foo", "bar", 3, bitDelete}, {"fooz", "baz", 1, bitDelete}} l2 := []keyValVersion{{"fooo", "barr", 2, 0}} createAndOpen(db, l1, 1) createAndOpen(db, l2, 2) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 3, 1}, {"fooo", "barr", 2, 0}, {"fooz", "baz", 1, 1}, }) cdef := compactDef{ thisLevel: db.lc.levels[1], nextLevel: db.lc.levels[2], top: db.lc.levels[1].tables, bot: db.lc.levels[2].tables, } require.NoError(t, db.lc.runCompactDef(1, cdef)) // foo version 2 should be dropped after compaction. getAllAndCheck(t, db, []keyValVersion{{"fooo", "barr", 2, 0}}) }) }) } func TestHeadKeyCleanup(t *testing.T) { // Disable compactions and keep single version of each key. opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(1) opt.managedTxns = true runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l0 := []keyValVersion{ {string(head), "foo", 5, 0}, {string(head), "bar", 4, 0}, {string(head), "baz", 3, 0}, } l1 := []keyValVersion{{string(head), "fooz", 2, 0}, {string(head), "foozbaz", 1, 0}} // Level 0 has table l0 and l01. createAndOpen(db, l0, 0) // Level 1 has table l1. createAndOpen(db, l1, 1) // Set a high discard timestamp so that all the keys are below the discard timestamp. db.SetDiscardTs(10) getAllAndCheck(t, db, []keyValVersion{ {string(head), "foo", 5, 0}, {string(head), "bar", 4, 0}, {string(head), "baz", 3, 0}, {string(head), "fooz", 2, 0}, {string(head), "foozbaz", 1, 0}, }) cdef := compactDef{ thisLevel: db.lc.levels[0], nextLevel: db.lc.levels[1], top: db.lc.levels[0].tables, bot: db.lc.levels[1].tables, } require.NoError(t, db.lc.runCompactDef(0, cdef)) // foo version 2 should be dropped after compaction. getAllAndCheck(t, db, []keyValVersion{{string(head), "foo", 5, 0}}) }) } func TestDiscardTs(t *testing.T) { // Disable compactions and keep single version of each key. opt := DefaultOptions("").WithNumCompactors(0).WithNumVersionsToKeep(1) opt.managedTxns = true t.Run("all keys above discardTs", func(t *testing.T) { runBadgerTest(t, &opt, func(t *testing.T, db *DB) { l0 := []keyValVersion{{"foo", "bar", 4, 0}, {"fooz", "baz", 3, 0}} l01 := []keyValVersion{{"foo", "bar", 3, 0}} l1 := []keyValVersion{{"foo", "bar", 2, 0}} // Level 0 has table l0 and l01. createAndOpen(db, l0, 0) createAndOpen(db, l01, 0) // Level 1 has table l1. createAndOpen(db, l1, 1) // Set dicardTs to 1. All the keys are above discardTs. db.SetDiscardTs(1) getAllAndCheck(t, db, []keyValVersion{ {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0}, {"fooz", "baz", 3, 0}, }) cdef := compactDef{ thisLevel: db.lc.levels[0], nextLevel: db.lc.levels[1], top: db.lc.levels[0].tables, bot: db.lc.levels[1].tables, } require.NoError(t, db.lc.runCompactDef(0, cdef)) // No keys should be dropped. 
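// Versions above discardTs are never candidates for removal: compactBuildTables
// only discards entries with version <= discardTs, preserving the snapshot
// view of any transaction still reading at a higher timestamp.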
getAllAndCheck(t, db, []keyValVersion{
				{"foo", "bar", 4, 0}, {"foo", "bar", 3, 0},
				{"foo", "bar", 2, 0}, {"fooz", "baz", 3, 0},
			})
		})
	})
	t.Run("some keys above discardTs", func(t *testing.T) {
		runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
			l0 := []keyValVersion{
				{"foo", "bar", 4, 0}, {"foo", "bar", 3, 0},
				{"foo", "bar", 2, 0}, {"fooz", "baz", 2, 0},
			}
			l1 := []keyValVersion{{"foo", "bbb", 1, 0}}
			createAndOpen(db, l0, 0)
			createAndOpen(db, l1, 1)

			// Set discardTs to 3. foo versions 2 and 1 should be dropped.
			db.SetDiscardTs(3)
			getAllAndCheck(t, db, []keyValVersion{
				{"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, {"foo", "bar", 2, 0},
				{"foo", "bbb", 1, 0}, {"fooz", "baz", 2, 0},
			})

			cdef := compactDef{
				thisLevel: db.lc.levels[0],
				nextLevel: db.lc.levels[1],
				top:       db.lc.levels[0].tables,
				bot:       db.lc.levels[1].tables,
			}
			require.NoError(t, db.lc.runCompactDef(0, cdef))

			// foo versions 1 and 2 should be dropped.
			getAllAndCheck(t, db, []keyValVersion{
				{"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, {"fooz", "baz", 2, 0},
			})
		})
	})
	t.Run("all keys below discardTs", func(t *testing.T) {
		runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
			l0 := []keyValVersion{{"foo", "bar", 4, 0}, {"fooz", "baz", 3, 0}}
			l01 := []keyValVersion{{"foo", "bar", 3, 0}}
			l1 := []keyValVersion{{"foo", "bar", 2, 0}}
			// Level 0 has table l0 and l01.
			createAndOpen(db, l0, 0)
			createAndOpen(db, l01, 0)
			// Level 1 has table l1.
			createAndOpen(db, l1, 1)

			// Set discardTs to 10. All the keys are below discardTs.
			db.SetDiscardTs(10)
			getAllAndCheck(t, db, []keyValVersion{
				{"foo", "bar", 4, 0}, {"foo", "bar", 3, 0},
				{"foo", "bar", 2, 0}, {"fooz", "baz", 3, 0},
			})

			cdef := compactDef{
				thisLevel: db.lc.levels[0],
				nextLevel: db.lc.levels[1],
				top:       db.lc.levels[0].tables,
				bot:       db.lc.levels[1].tables,
			}
			require.NoError(t, db.lc.runCompactDef(0, cdef))

			// Only one version of every key should be left.
			getAllAndCheck(t, db, []keyValVersion{{"foo", "bar", 4, 0}, {"fooz", "baz", 3, 0}})
		})
	})
}

// This test ensures that the first entry with the bitDiscardEarlierVersions bit
// set below DiscardTs is kept around (when NumVersionsToKeep is infinite).
func TestDiscardFirstVersion(t *testing.T) {
	opt := DefaultOptions("")
	opt.NumCompactors = 0
	opt.NumVersionsToKeep = math.MaxInt32
	opt.managedTxns = true

	runBadgerTest(t, &opt, func(t *testing.T, db *DB) {
		l0 := []keyValVersion{{"foo", "bar", 1, 0}}
		l01 := []keyValVersion{{"foo", "bar", 2, bitDiscardEarlierVersions}}
		l02 := []keyValVersion{{"foo", "bar", 3, 0}}
		l03 := []keyValVersion{{"foo", "bar", 4, 0}}
		l04 := []keyValVersion{{"foo", "bar", 9, 0}}
		l05 := []keyValVersion{{"foo", "bar", 10, bitDiscardEarlierVersions}}

		// Level 0 has all the tables.
		createAndOpen(db, l0, 0)
		createAndOpen(db, l01, 0)
		createAndOpen(db, l02, 0)
		createAndOpen(db, l03, 0)
		createAndOpen(db, l04, 0)
		createAndOpen(db, l05, 0)

		// The discard timestamp is set to 7.
		db.SetDiscardTs(7)
		// Compact L0 to L1.
		cdef := compactDef{
			thisLevel: db.lc.levels[0],
			nextLevel: db.lc.levels[1],
			top:       db.lc.levels[0].tables,
			bot:       db.lc.levels[1].tables,
		}
		require.NoError(t, db.lc.runCompactDef(0, cdef))

		// - Versions 10 and 9 lie above the discard timestamp (7), so they should stay.
		// - Versions 4, 3 and 2 lie below the discardTs, but they don't have the
		//   bitDiscardEarlierVersions bit set, so they should not be removed, because
		//   the number of versions to keep is set to infinite.
		// - Version 1 is below discardTs and below the first bitDiscardEarlierVersions
		//   marker, so IT WILL BE REMOVED.
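// In short, a version may be removed only if it is at or below discardTs AND
// either a newer version, also at or below discardTs, has the
// bitDiscardEarlierVersions marker set, or the NumVersionsToKeep budget
// (infinite in this test) has been used up.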
ExpectedKeys := []keyValVersion{ {"foo", "bar", 10, bitDiscardEarlierVersions}, {"foo", "bar", 9, 0}, {"foo", "bar", 4, 0}, {"foo", "bar", 3, 0}, {"foo", "bar", 2, bitDiscardEarlierVersions}} getAllAndCheck(t, db, ExpectedKeys) }) } // This test ensures we don't stall when L1's size is greater than opt.LevelOneSize. // We should stall only when L0 tables more than the opt.NumLevelZeroTableStall. func TestL1Stall(t *testing.T) { opt := DefaultOptions("") // Disable all compactions. opt.NumCompactors = 0 // Number of level zero tables. opt.NumLevelZeroTables = 3 // Addition of new tables will stall if there are 4 or more L0 tables. opt.NumLevelZeroTablesStall = 4 // Level 1 size is 10 bytes. opt.LevelOneSize = 10 runBadgerTest(t, &opt, func(t *testing.T, db *DB) { // Level 0 has 4 tables. db.lc.levels[0].Lock() db.lc.levels[0].tables = []*table.Table{createEmptyTable(db), createEmptyTable(db), createEmptyTable(db), createEmptyTable(db)} db.lc.levels[0].Unlock() timeout := time.After(5 * time.Second) done := make(chan bool) // This is important. Set level 1 size more than the opt.LevelOneSize (we've set it to 10). db.lc.levels[1].totalSize = 100 go func() { tab := createEmptyTable(db) require.NoError(t, db.lc.addLevel0Table(tab)) tab.DecrRef() done <- true }() time.Sleep(time.Second) db.lc.levels[0].Lock() // Drop two tables from Level 0 so that addLevel0Table can make progress. Earlier table // count was 4 which is equal to L0 stall count. toDrop := db.lc.levels[0].tables[:2] decrRefs(toDrop) db.lc.levels[0].tables = db.lc.levels[0].tables[2:] db.lc.levels[0].Unlock() select { case <-timeout: t.Fatal("Test didn't finish in time") case <-done: } }) } func createEmptyTable(db *DB) *table.Table { opts := table.Options{ BloomFalsePositive: db.opt.BloomFalsePositive, LoadingMode: options.LoadToRAM, ChkMode: options.NoVerification, } b := table.NewTableBuilder(opts) // Add one key so that we can open this table. b.Add(y.KeyWithTs([]byte("foo"), 1), y.ValueStruct{}, 0) // Open table in memory to avoid adding changes to manifest file. tab, err := table.OpenInMemoryTable(b.Finish(), db.lc.reserveFileID(), &opts) if err != nil { panic(err) } return tab } func TestL0Stall(t *testing.T) { opt := DefaultOptions("") // Disable all compactions. opt.NumCompactors = 0 // Number of level zero tables. opt.NumLevelZeroTables = 3 // Addition of new tables will stall if there are 4 or more L0 tables. opt.NumLevelZeroTablesStall = 4 runBadgerTest(t, &opt, func(t *testing.T, db *DB) { db.lc.levels[0].Lock() // Add NumLevelZeroTableStall+1 number of tables to level 0. This would fill up level // zero and all new additions are expected to stall if L0 is in memory. for i := 0; i < opt.NumLevelZeroTablesStall+1; i++ { db.lc.levels[0].tables = append(db.lc.levels[0].tables, createEmptyTable(db)) } db.lc.levels[0].Unlock() timeout := time.After(5 * time.Second) done := make(chan bool) go func() { tab := createEmptyTable(db) require.NoError(t, db.lc.addLevel0Table(tab)) tab.DecrRef() done <- true }() // Let it stall for a second. time.Sleep(time.Second) select { case <-timeout: t.Log("Timeout triggered") // Mark this test as successful since L0 is in memory and the // addition of new table to L0 is supposed to stall. // Remove tables from level 0 so that the stalled // compaction can make progress. This does not have any // effect on the test. This is done so that the goroutine // stuck on addLevel0Table can make progress and end. 
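// Clearing L0 unstalls the writer because addLevel0Table loops on
// tryAddLevel0Table; once the table count drops below NumLevelZeroTablesStall,
// the pending add succeeds and the goroutine signals done.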
db.lc.levels[0].Lock() db.lc.levels[0].tables = nil db.lc.levels[0].Unlock() <-done case <-done: // The test completed before 5 second timeout. Mark it as successful. t.Fatal("Test did not stall") } }) } // Regression test for https://github.com/dgraph-io/dgraph/issues/5573 func TestDropPrefixMoveBug(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // l1 is used to verify that drop prefix actually drops move keys from all the levels. l1 := []keyValVersion{{string(append(badgerMove, "F"...)), "", 0, 0}} createAndOpen(db, l1, 1) // Mutiple levels can have the exact same move key with version. l2 := []keyValVersion{{string(append(badgerMove, "F"...)), "", 0, 0}, {"A", "", 0, 0}} l21 := []keyValVersion{{"B", "", 0, 0}, {"C", "", 0, 0}} l22 := []keyValVersion{{"F", "", 0, 0}, {"G", "", 0, 0}} // Level 2 has all the tables. createAndOpen(db, l2, 2) createAndOpen(db, l21, 2) createAndOpen(db, l22, 2) require.NoError(t, db.lc.validate()) require.NoError(t, db.DropPrefix([]byte("F"))) db.View(func(txn *Txn) error { iopt := DefaultIteratorOptions iopt.AllVersions = true it := txn.NewIterator(iopt) defer it.Close() specialKey := []byte("F") droppedPrefixes := [][]byte{specialKey, append(badgerMove, specialKey...)} for it.Rewind(); it.Valid(); it.Next() { key := it.Item().Key() // Ensure we don't have any "F" or "!badger!move!F" left require.False(t, hasAnyPrefixes(key, droppedPrefixes)) } return nil }) require.NoError(t, db.lc.validate()) }) } badger-2.2007.2/logger.go000066400000000000000000000047241372173116500150130ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "log" "os" ) // Logger is implemented by any logging system that is used for standard logs. type Logger interface { Errorf(string, ...interface{}) Warningf(string, ...interface{}) Infof(string, ...interface{}) Debugf(string, ...interface{}) } // Errorf logs an ERROR log message to the logger specified in opts or to the // global logger if no logger is specified in opts. func (opt *Options) Errorf(format string, v ...interface{}) { if opt.Logger == nil { return } opt.Logger.Errorf(format, v...) } // Infof logs an INFO message to the logger specified in opts. func (opt *Options) Infof(format string, v ...interface{}) { if opt.Logger == nil { return } opt.Logger.Infof(format, v...) } // Warningf logs a WARNING message to the logger specified in opts. func (opt *Options) Warningf(format string, v ...interface{}) { if opt.Logger == nil { return } opt.Logger.Warningf(format, v...) } // Debugf logs a DEBUG message to the logger specified in opts. func (opt *Options) Debugf(format string, v ...interface{}) { if opt.Logger == nil { return } opt.Logger.Debugf(format, v...) 
} type loggingLevel int const ( DEBUG loggingLevel = iota INFO WARNING ERROR ) type defaultLog struct { *log.Logger level loggingLevel } func defaultLogger(level loggingLevel) *defaultLog { return &defaultLog{Logger: log.New(os.Stderr, "badger ", log.LstdFlags), level: level} } func (l *defaultLog) Errorf(f string, v ...interface{}) { if l.level <= ERROR { l.Printf("ERROR: "+f, v...) } } func (l *defaultLog) Warningf(f string, v ...interface{}) { if l.level <= WARNING { l.Printf("WARNING: "+f, v...) } } func (l *defaultLog) Infof(f string, v ...interface{}) { if l.level <= INFO { l.Printf("INFO: "+f, v...) } } func (l *defaultLog) Debugf(f string, v ...interface{}) { if l.level <= DEBUG { l.Printf("DEBUG: "+f, v...) } } badger-2.2007.2/logger_test.go000066400000000000000000000035131372173116500160450ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "fmt" "testing" "github.com/stretchr/testify/require" ) type mockLogger struct { output string } func (l *mockLogger) Errorf(f string, v ...interface{}) { l.output = fmt.Sprintf("ERROR: "+f, v...) } func (l *mockLogger) Infof(f string, v ...interface{}) { l.output = fmt.Sprintf("INFO: "+f, v...) } func (l *mockLogger) Warningf(f string, v ...interface{}) { l.output = fmt.Sprintf("WARNING: "+f, v...) } func (l *mockLogger) Debugf(f string, v ...interface{}) { l.output = fmt.Sprintf("DEBUG: "+f, v...) } // Test that the DB-specific log is used instead of the global log. func TestDbLog(t *testing.T) { l := &mockLogger{} opt := Options{Logger: l} opt.Errorf("test") require.Equal(t, "ERROR: test", l.output) opt.Infof("test") require.Equal(t, "INFO: test", l.output) opt.Warningf("test") require.Equal(t, "WARNING: test", l.output) } // Test that the global logger is used when no logger is specified in Options. func TestNoDbLog(t *testing.T) { l := &mockLogger{} opt := Options{} opt.Logger = l opt.Errorf("test") require.Equal(t, "ERROR: test", l.output) opt.Infof("test") require.Equal(t, "INFO: test", l.output) opt.Warningf("test") require.Equal(t, "WARNING: test", l.output) } badger-2.2007.2/managed_db.go000066400000000000000000000057441372173116500156000ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger // OpenManaged returns a new DB, which allows more control over setting // transaction timestamps, aka managed mode. 
// // This is only useful for databases built on top of Badger (like Dgraph), and // can be ignored by most users. func OpenManaged(opts Options) (*DB, error) { opts.managedTxns = true return Open(opts) } // NewTransactionAt follows the same logic as DB.NewTransaction(), but uses the // provided read timestamp. // // This is only useful for databases built on top of Badger (like Dgraph), and // can be ignored by most users. func (db *DB) NewTransactionAt(readTs uint64, update bool) *Txn { if !db.opt.managedTxns { panic("Cannot use NewTransactionAt with managedDB=false. Use NewTransaction instead.") } txn := db.newTransaction(update, true) txn.readTs = readTs return txn } // NewWriteBatchAt is similar to NewWriteBatch but it allows user to set the commit timestamp. // NewWriteBatchAt is supposed to be used only in the managed mode. func (db *DB) NewWriteBatchAt(commitTs uint64) *WriteBatch { if !db.opt.managedTxns { panic("cannot use NewWriteBatchAt with managedDB=false. Use NewWriteBatch instead") } wb := db.newWriteBatch(true) wb.commitTs = commitTs wb.txn.commitTs = commitTs return wb } func (db *DB) NewManagedWriteBatch() *WriteBatch { if !db.opt.managedTxns { panic("cannot use NewManagedWriteBatch with managedDB=false. Use NewWriteBatch instead") } wb := db.newWriteBatch(true) return wb } // CommitAt commits the transaction, following the same logic as Commit(), but // at the given commit timestamp. This will panic if not used with managed transactions. // // This is only useful for databases built on top of Badger (like Dgraph), and // can be ignored by most users. func (txn *Txn) CommitAt(commitTs uint64, callback func(error)) error { if !txn.db.opt.managedTxns { panic("Cannot use CommitAt with managedDB=false. Use Commit instead.") } txn.commitTs = commitTs if callback == nil { return txn.Commit() } txn.CommitWith(callback) return nil } // SetDiscardTs sets a timestamp at or below which, any invalid or deleted // versions can be discarded from the LSM tree, and thence from the value log to // reclaim disk space. Can only be used with managed transactions. 
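// // A minimal managed-mode sketch (illustrative only; the path is hypothetical and // error handling is elided): // //   db, _ := badger.OpenManaged(badger.DefaultOptions("/tmp/badger")) //   txn := db.NewTransactionAt(10 /* readTs */, true) //   _ = txn.SetEntry(badger.NewEntry([]byte("k"), []byte("v"))) //   _ = txn.CommitAt(11 /* commitTs */, nil) //   db.SetDiscardTs(5) // versions at or below ts 5 may now be discarded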
func (db *DB) SetDiscardTs(ts uint64) { if !db.opt.managedTxns { panic("Cannot use SetDiscardTs with managedDB=false.") } db.orc.setDiscardTs(ts) } badger-2.2007.2/managed_db_test.go000066400000000000000000000462061372173116500166350ustar00rootroot00000000000000package badger import ( "fmt" "io/ioutil" "math" "math/rand" "runtime" "sync" "sync/atomic" "testing" "time" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) func val(large bool) []byte { var buf []byte if large { buf = make([]byte, 8192) } else { buf = make([]byte, 16) } rand.Read(buf) return buf } func numKeys(db *DB) int { var count int err := db.View(func(txn *Txn) error { itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() for itr.Rewind(); itr.Valid(); itr.Next() { count++ } return nil }) y.Check(err) return count } func numKeysManaged(db *DB, readTs uint64) int { txn := db.NewTransactionAt(readTs, false) defer txn.Discard() itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() var count int for itr.Rewind(); itr.Valid(); itr.Next() { count++ } return count } func TestDropAllManaged(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.managedTxns = true opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) N := uint64(10000) populate := func(db *DB, start uint64) { var wg sync.WaitGroup for i := start; i < start+N; i++ { wg.Add(1) txn := db.NewTransactionAt(math.MaxUint64, true) require.NoError(t, txn.SetEntry(NewEntry([]byte(key("key", int(i))), val(true)))) require.NoError(t, txn.CommitAt(uint64(i), func(err error) { require.NoError(t, err) wg.Done() })) } wg.Wait() } populate(db, N) require.Equal(t, int(N), numKeysManaged(db, math.MaxUint64)) require.NoError(t, db.DropAll()) require.NoError(t, db.DropAll()) // Just call it twice, for fun. require.Equal(t, 0, numKeysManaged(db, math.MaxUint64)) // Check that we can still write to db, and using lower timestamps. populate(db, 1) require.Equal(t, int(N), numKeysManaged(db, math.MaxUint64)) require.NoError(t, db.Close()) // Ensure that value log is correctly replayed, that we are preserving badgerHead. opts.managedTxns = true db2, err := Open(opts) require.NoError(t, err) require.Equal(t, int(N), numKeysManaged(db2, math.MaxUint64)) require.NoError(t, db2.Close()) } func TestDropAll(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) N := uint64(10000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(true))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.DropAll()) require.Equal(t, 0, numKeys(db)) // Check that we can still write to mdb, and using lower timestamps. populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.Close()) // Ensure that value log is correctly replayed. 
db2, err := Open(opts) require.NoError(t, err) require.Equal(t, int(N), numKeys(db2)) require.NoError(t, db2.Close()) } func TestDropAllTwice(t *testing.T) { test := func(t *testing.T, opts Options) { db, err := Open(opts) require.NoError(t, err) defer func() { require.NoError(t, db.Close()) }() N := uint64(10000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(false))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.DropAll()) require.Equal(t, 0, numKeys(db)) // Call DropAll again. require.NoError(t, db.DropAll()) require.NoError(t, db.Close()) } t.Run("disk mode", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 test(t, opts) }) t.Run("InMemory mode", func(t *testing.T) { opts := getTestOptions("") opts.InMemory = true test(t, opts) }) } func TestDropAllWithPendingTxn(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) defer func() { require.NoError(t, db.Close()) }() N := uint64(10000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(true))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) txn := db.NewTransaction(true) var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() var keys []string for { var count int for itr.Rewind(); itr.Valid(); itr.Next() { count++ item := itr.Item() keys = append(keys, string(item.KeyCopy(nil))) _, err := item.ValueCopy(nil) if err != nil { t.Logf("Got error during value copy: %v", err) return } } t.Logf("Got number of keys: %d\n", count) for _, key := range keys { item, err := txn.Get([]byte(key)) if err != nil { t.Logf("Got error during key lookup: %v", err) return } if _, err := item.ValueCopy(nil); err != nil { t.Logf("Got error during second value copy: %v", err) return } } } }() // Do not cancel txn. wg.Add(1) go func() { defer wg.Done() time.Sleep(2 * time.Second) require.NoError(t, db.DropAll()) }() wg.Wait() } func TestDropReadOnly(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) N := uint64(1000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(true))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.Close()) opts.ReadOnly = true db2, err := Open(opts) // acquireDirectoryLock returns ErrWindowsNotSupported on Windows. It can be ignored safely. 
if runtime.GOOS == "windows" { require.Equal(t, err, ErrWindowsNotSupported) } else { require.NoError(t, err) require.Panics(t, func() { db2.DropAll() }) require.NoError(t, db2.Close()) } } func TestWriteAfterClose(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) N := uint64(1000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(true))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.Close()) err = db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("a"), []byte("b"))) }) require.Equal(t, ErrDBClosed, err) } func TestDropAllRace(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.managedTxns = true db, err := Open(opts) require.NoError(t, err) N := 10000 // Start a goroutine to keep trying to write to DB while DropAll happens. closer := y.NewCloser(1) go func() { defer closer.Done() ticker := time.NewTicker(time.Millisecond) defer ticker.Stop() i := N + 1 // Writes would happen above N. var errors int32 for { select { case <-ticker.C: i++ txn := db.NewTransactionAt(math.MaxUint64, true) require.NoError(t, txn.SetEntry(NewEntry([]byte(key("key", i)), val(false)))) if err := txn.CommitAt(uint64(i), func(err error) { if err != nil { atomic.AddInt32(&errors, 1) } }); err != nil { atomic.AddInt32(&errors, 1) } case <-closer.HasBeenClosed(): // The following causes a data race. // t.Logf("i: %d. Number of (expected) write errors: %d.\n", i, errors) return } } }() var wg sync.WaitGroup for i := 1; i <= N; i++ { wg.Add(1) txn := db.NewTransactionAt(math.MaxUint64, true) require.NoError(t, txn.SetEntry(NewEntry([]byte(key("key", i)), val(false)))) require.NoError(t, txn.CommitAt(uint64(i), func(err error) { require.NoError(t, err) wg.Done() })) } wg.Wait() before := numKeysManaged(db, math.MaxUint64) require.True(t, before > N) require.NoError(t, db.DropAll()) closer.SignalAndWait() after := numKeysManaged(db, math.MaxUint64) t.Logf("Before: %d. After dropall: %d\n", before, after) require.True(t, after < before) db.Close() } func TestDropPrefix(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) N := uint64(10000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(true))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.DropPrefix([]byte("key000"))) require.Equal(t, int(N)-10, numKeys(db)) require.NoError(t, db.DropPrefix([]byte("key00"))) require.Equal(t, int(N)-100, numKeys(db)) expected := int(N) for i := 0; i < 10; i++ { require.NoError(t, db.DropPrefix([]byte(fmt.Sprintf("key%d", i)))) expected -= 1000 require.Equal(t, expected, numKeys(db)) } require.NoError(t, db.DropPrefix([]byte("key1"))) require.Equal(t, 0, numKeys(db)) require.NoError(t, db.DropPrefix([]byte("key"))) require.Equal(t, 0, numKeys(db)) // Check that we can still write to mdb, and using lower timestamps. 
populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.DropPrefix([]byte("key"))) db.Close() // Ensure that value log is correctly replayed. db2, err := Open(opts) require.NoError(t, err) require.Equal(t, 0, numKeys(db2)) db2.Close() } func TestDropPrefixWithPendingTxn(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) defer func() { require.NoError(t, db.Close()) }() N := uint64(10000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(true))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) txn := db.NewTransaction(true) var wg sync.WaitGroup wg.Add(2) go func() { defer wg.Done() itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() var keys []string for { var count int for itr.Rewind(); itr.Valid(); itr.Next() { count++ item := itr.Item() keys = append(keys, string(item.KeyCopy(nil))) _, err := item.ValueCopy(nil) if err != nil { t.Logf("Got error during value copy: %v", err) return } } t.Logf("Got number of keys: %d\n", count) for _, key := range keys { item, err := txn.Get([]byte(key)) if err != nil { t.Logf("Got error during key lookup: %v", err) return } if _, err := item.ValueCopy(nil); err != nil { t.Logf("Got error during second value copy: %v", err) return } } } }() // Do not cancel txn. go func() { defer wg.Done() time.Sleep(2 * time.Second) require.NoError(t, db.DropPrefix([]byte("key0"))) require.NoError(t, db.DropPrefix([]byte("key00"))) require.NoError(t, db.DropPrefix([]byte("key"))) }() wg.Wait() } func TestDropPrefixReadOnly(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 5 << 20 db, err := Open(opts) require.NoError(t, err) N := uint64(1000) populate := func(db *DB) { writer := db.NewWriteBatch() for i := uint64(0); i < N; i++ { require.NoError(t, writer.Set([]byte(key("key", int(i))), val(true))) } require.NoError(t, writer.Flush()) } populate(db) require.Equal(t, int(N), numKeys(db)) require.NoError(t, db.Close()) opts.ReadOnly = true db2, err := Open(opts) // acquireDirectoryLock returns ErrWindowsNotSupported on Windows. It can be ignored safely. if runtime.GOOS == "windows" { require.Equal(t, err, ErrWindowsNotSupported) } else { require.NoError(t, err) require.Panics(t, func() { db2.DropPrefix([]byte("key0")) }) require.NoError(t, db2.Close()) } } func TestDropPrefixRace(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.managedTxns = true db, err := Open(opts) require.NoError(t, err) N := 10000 // Start a goroutine to keep trying to write to DB while DropPrefix happens. closer := y.NewCloser(1) go func() { defer closer.Done() ticker := time.NewTicker(time.Millisecond) defer ticker.Stop() i := N + 1 // Writes would happen above N. var errors int32 for { select { case <-ticker.C: i++ txn := db.NewTransactionAt(math.MaxUint64, true) require.NoError(t, txn.SetEntry(NewEntry([]byte(key("key", i)), val(false)))) if err := txn.CommitAt(uint64(i), func(err error) { if err != nil { atomic.AddInt32(&errors, 1) } }); err != nil { atomic.AddInt32(&errors, 1) } case <-closer.HasBeenClosed(): // The following causes a data race. // t.Logf("i: %d. 
Number of (expected) write errors: %d.\n", i, errors) return } } }() var wg sync.WaitGroup for i := 1; i <= N; i++ { wg.Add(1) txn := db.NewTransactionAt(math.MaxUint64, true) require.NoError(t, txn.SetEntry(NewEntry([]byte(key("key", i)), val(false)))) require.NoError(t, txn.CommitAt(uint64(i), func(err error) { require.NoError(t, err) wg.Done() })) } wg.Wait() before := numKeysManaged(db, math.MaxUint64) require.True(t, before > N) require.NoError(t, db.DropPrefix([]byte("key00"))) require.NoError(t, db.DropPrefix([]byte("key1"))) require.NoError(t, db.DropPrefix([]byte("key"))) closer.SignalAndWait() after := numKeysManaged(db, math.MaxUint64) t.Logf("Before: %d. After dropprefix: %d\n", before, after) require.True(t, after < before) require.NoError(t, db.Close()) } func TestWriteBatchManagedMode(t *testing.T) { key := func(i int) []byte { return []byte(fmt.Sprintf("%10d", i)) } val := func(i int) []byte { return []byte(fmt.Sprintf("%128d", i)) } opt := DefaultOptions("") opt.managedTxns = true opt.MaxTableSize = 1 << 20 // This would create multiple transactions in write batch. runBadgerTest(t, &opt, func(t *testing.T, db *DB) { wb := db.NewWriteBatchAt(1) defer wb.Cancel() N, M := 50000, 1000 start := time.Now() for i := 0; i < N; i++ { require.NoError(t, wb.Set(key(i), val(i))) } for i := 0; i < M; i++ { require.NoError(t, wb.Delete(key(i))) } require.NoError(t, wb.Flush()) t.Logf("Time taken for %d writes (w/ test options): %s\n", N+M, time.Since(start)) err := db.View(func(txn *Txn) error { itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() i := M for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() require.Equal(t, string(key(i)), string(item.Key())) require.Equal(t, item.Version(), uint64(1)) valcopy, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val(i), valcopy) i++ } require.Equal(t, N, i) return nil }) require.NoError(t, err) }) } func TestWriteBatchManaged(t *testing.T) { key := func(i int) []byte { return []byte(fmt.Sprintf("%10d", i)) } val := func(i int) []byte { return []byte(fmt.Sprintf("%128d", i)) } opt := DefaultOptions("") opt.managedTxns = true opt.MaxTableSize = 1 << 15 // This would create multiple transactions in write batch. 
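// Note (assumption, not verified in this file): the internal batch limits scale with // MaxTableSize, so a 32 KB table size should force the 50k writes below to be split // across many internal transactions, which is what this test exercises.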
runBadgerTest(t, &opt, func(t *testing.T, db *DB) { wb := db.NewManagedWriteBatch() defer wb.Cancel() N, M := 50000, 1000 start := time.Now() for i := 0; i < N; i++ { require.NoError(t, wb.SetEntryAt(&Entry{Key: key(i), Value: val(i)}, 1)) } for i := 0; i < M; i++ { require.NoError(t, wb.DeleteAt(key(i), 2)) } require.NoError(t, wb.Flush()) t.Logf("Time taken for %d writes (w/ test options): %s\n", N+M, time.Since(start)) err := db.View(func(txn *Txn) error { itr := txn.NewIterator(DefaultIteratorOptions) defer itr.Close() i := M for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() require.Equal(t, string(key(i)), string(item.Key())) require.Equal(t, item.Version(), uint64(1)) valcopy, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val(i), valcopy) i++ } require.Equal(t, N, i) return nil }) require.NoError(t, err) }) } func TestWriteBatchDuplicate(t *testing.T) { N := 10 k := []byte("key") v := []byte("val") readVerify := func(t *testing.T, db *DB, n int, versions []int) { err := db.View(func(txn *Txn) error { iopt := DefaultIteratorOptions iopt.AllVersions = true itr := txn.NewIterator(iopt) defer itr.Close() i := 0 for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() require.Equal(t, k, item.Key()) require.Equal(t, uint64(versions[i]), item.Version()) err := item.Value(func(val []byte) error { require.Equal(t, v, val) return nil }) require.NoError(t, err) i++ } require.Equal(t, n, i) return nil }) require.NoError(t, err) } t.Run("writebatch", func(t *testing.T) { opt := DefaultOptions("") opt.MaxTableSize = 1 << 15 // This would create multiple transactions in write batch. runBadgerTest(t, &opt, func(t *testing.T, db *DB) { wb := db.NewWriteBatch() defer wb.Cancel() for i := uint64(0); i < uint64(N); i++ { // Multiple versions of the same key. require.NoError(t, wb.SetEntry(&Entry{Key: k, Value: v})) } require.NoError(t, wb.Flush()) readVerify(t, db, 1, []int{1}) }) }) t.Run("writebatch at", func(t *testing.T) { opt := DefaultOptions("") opt.MaxTableSize = 1 << 15 // This would create multiple transactions in write batch. opt.managedTxns = true runBadgerTest(t, &opt, func(t *testing.T, db *DB) { wb := db.NewWriteBatchAt(10) defer wb.Cancel() for i := uint64(0); i < uint64(N); i++ { // Multiple versions of the same key. require.NoError(t, wb.SetEntry(&Entry{Key: k, Value: v})) } require.NoError(t, wb.Flush()) readVerify(t, db, 1, []int{10}) }) }) t.Run("managed writebatch", func(t *testing.T) { opt := DefaultOptions("") opt.managedTxns = true opt.MaxTableSize = 1 << 15 // This would create multiple transactions in write batch. runBadgerTest(t, &opt, func(t *testing.T, db *DB) { wb := db.NewManagedWriteBatch() defer wb.Cancel() for i := uint64(0); i < uint64(N); i++ { // Multiple versions of the same key. require.NoError(t, wb.SetEntryAt(&Entry{Key: k, Value: v}, i)) } require.NoError(t, wb.Flush()) readVerify(t, db, N, []int{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}) }) }) } badger-2.2007.2/manifest.go000066400000000000000000000322431372173116500153370ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bufio" "bytes" "encoding/binary" "fmt" "hash/crc32" "io" "os" "path/filepath" "sync" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/golang/protobuf/proto" "github.com/pkg/errors" ) // Manifest represents the contents of the MANIFEST file in a Badger store. // // The MANIFEST file describes the startup state of the db -- all LSM files and what level they're // at. // // It consists of a sequence of ManifestChangeSet objects. Each of these is treated atomically, // and contains a sequence of ManifestChange's (file creations/deletions) which we use to // reconstruct the manifest at startup. type Manifest struct { Levels []levelManifest Tables map[uint64]TableManifest // Contains total number of creation and deletion changes in the manifest -- used to compute // whether it'd be useful to rewrite the manifest. Creations int Deletions int } func createManifest() Manifest { levels := make([]levelManifest, 0) return Manifest{ Levels: levels, Tables: make(map[uint64]TableManifest), } } // levelManifest contains information about LSM tree levels // in the MANIFEST file. type levelManifest struct { Tables map[uint64]struct{} // Set of table id's } // TableManifest contains information about a specific table // in the LSM tree. type TableManifest struct { Level uint8 KeyID uint64 Compression options.CompressionType } // manifestFile holds the file pointer (and other info) about the manifest file, which is a log // file we append to. type manifestFile struct { fp *os.File directory string // We make this configurable so that unit tests can hit rewrite() code quickly deletionsRewriteThreshold int // Guards appends, which includes access to the manifest field. appendLock sync.Mutex // Used to track the current state of the manifest, used when rewriting. manifest Manifest // Used to indicate if badger was opened in InMemory mode. inMemory bool } const ( // ManifestFilename is the filename for the manifest file. ManifestFilename = "MANIFEST" manifestRewriteFilename = "MANIFEST-REWRITE" manifestDeletionsRewriteThreshold = 10000 manifestDeletionsRatio = 10 ) // asChanges returns a sequence of changes that could be used to recreate the Manifest in its // present state. func (m *Manifest) asChanges() []*pb.ManifestChange { changes := make([]*pb.ManifestChange, 0, len(m.Tables)) for id, tm := range m.Tables { changes = append(changes, newCreateChange(id, int(tm.Level), tm.KeyID, tm.Compression)) } return changes } func (m *Manifest) clone() Manifest { changeSet := pb.ManifestChangeSet{Changes: m.asChanges()} ret := createManifest() y.Check(applyChangeSet(&ret, &changeSet)) return ret } // openOrCreateManifestFile opens a Badger manifest file if it exists, or creates one if it // doesn't exist.
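// // For reference, a sketch of how replay rebuilds in-memory state from change sets // (uses the package-private helpers defined later in this file; illustration only): // //   m := createManifest() //   cs := pb.ManifestChangeSet{Changes: []*pb.ManifestChange{ //       newCreateChange(1 /* table id */, 0 /* level */, 0 /* key id */, options.None), //   }} //   _ = applyChangeSet(&m, &cs) //   // m.Tables[1].Level == 0; m.Levels[0].Tables now contains id 1.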
func openOrCreateManifestFile(opt Options) ( ret *manifestFile, result Manifest, err error) { if opt.InMemory { return &manifestFile{inMemory: true}, Manifest{}, nil } return helpOpenOrCreateManifestFile(opt.Dir, opt.ReadOnly, manifestDeletionsRewriteThreshold) } func helpOpenOrCreateManifestFile(dir string, readOnly bool, deletionsThreshold int) ( *manifestFile, Manifest, error) { path := filepath.Join(dir, ManifestFilename) var flags uint32 if readOnly { flags |= y.ReadOnly } fp, err := y.OpenExistingFile(path, flags) // We explicitly sync in addChanges, outside the lock. if err != nil { if !os.IsNotExist(err) { return nil, Manifest{}, err } if readOnly { return nil, Manifest{}, fmt.Errorf("no manifest found, required for read-only db") } m := createManifest() fp, netCreations, err := helpRewrite(dir, &m) if err != nil { return nil, Manifest{}, err } y.AssertTrue(netCreations == 0) mf := &manifestFile{ fp: fp, directory: dir, manifest: m.clone(), deletionsRewriteThreshold: deletionsThreshold, } return mf, m, nil } manifest, truncOffset, err := ReplayManifestFile(fp) if err != nil { _ = fp.Close() return nil, Manifest{}, err } if !readOnly { // Truncate file so we don't have a half-written entry at the end. if err := fp.Truncate(truncOffset); err != nil { _ = fp.Close() return nil, Manifest{}, err } } if _, err = fp.Seek(0, io.SeekEnd); err != nil { _ = fp.Close() return nil, Manifest{}, err } mf := &manifestFile{ fp: fp, directory: dir, manifest: manifest.clone(), deletionsRewriteThreshold: deletionsThreshold, } return mf, manifest, nil } func (mf *manifestFile) close() error { if mf.inMemory { return nil } return mf.fp.Close() } // addChanges writes a batch of changes, atomically, to the file. By "atomically" that means when // we replay the MANIFEST file, we'll either replay all the changes or none of them. (The truth of // this depends on the filesystem -- some might append garbage data if a system crash happens at // the wrong time.) func (mf *manifestFile) addChanges(changesParam []*pb.ManifestChange) error { if mf.inMemory { return nil } changes := pb.ManifestChangeSet{Changes: changesParam} buf, err := proto.Marshal(&changes) if err != nil { return err } // Maybe we could use O_APPEND instead (on certain file systems) mf.appendLock.Lock() if err := applyChangeSet(&mf.manifest, &changes); err != nil { mf.appendLock.Unlock() return err } // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care if mf.manifest.Deletions > mf.deletionsRewriteThreshold && mf.manifest.Deletions > manifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) { if err := mf.rewrite(); err != nil { mf.appendLock.Unlock() return err } } else { var lenCrcBuf [8]byte binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf))) binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, y.CastagnoliCrcTable)) buf = append(lenCrcBuf[:], buf...) if _, err := mf.fp.Write(buf); err != nil { mf.appendLock.Unlock() return err } } mf.appendLock.Unlock() return mf.fp.Sync() } // Has to be 4 bytes. The value can never change, ever, anyway. var magicText = [4]byte{'B', 'd', 'g', 'r'} // The magic version number. const magicVersion = 7 func helpRewrite(dir string, m *Manifest) (*os.File, int, error) { rewritePath := filepath.Join(dir, manifestRewriteFilename) // We explicitly sync. 
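// Rewrite protocol, as implemented below: write a full snapshot to // MANIFEST-REWRITE, fsync and close it, rename it over MANIFEST, reopen it, // then sync the directory so the rename itself is durable.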
fp, err := y.OpenTruncFile(rewritePath, false) if err != nil { return nil, 0, err } buf := make([]byte, 8) copy(buf[0:4], magicText[:]) binary.BigEndian.PutUint32(buf[4:8], magicVersion) netCreations := len(m.Tables) changes := m.asChanges() set := pb.ManifestChangeSet{Changes: changes} changeBuf, err := proto.Marshal(&set) if err != nil { fp.Close() return nil, 0, err } var lenCrcBuf [8]byte binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf))) binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, y.CastagnoliCrcTable)) buf = append(buf, lenCrcBuf[:]...) buf = append(buf, changeBuf...) if _, err := fp.Write(buf); err != nil { fp.Close() return nil, 0, err } if err := fp.Sync(); err != nil { fp.Close() return nil, 0, err } // In Windows the files should be closed before doing a Rename. if err = fp.Close(); err != nil { return nil, 0, err } manifestPath := filepath.Join(dir, ManifestFilename) if err := os.Rename(rewritePath, manifestPath); err != nil { return nil, 0, err } fp, err = y.OpenExistingFile(manifestPath, 0) if err != nil { return nil, 0, err } if _, err := fp.Seek(0, io.SeekEnd); err != nil { fp.Close() return nil, 0, err } if err := syncDir(dir); err != nil { fp.Close() return nil, 0, err } return fp, netCreations, nil } // Must be called while appendLock is held. func (mf *manifestFile) rewrite() error { // In Windows the files should be closed before doing a Rename. if err := mf.fp.Close(); err != nil { return err } fp, netCreations, err := helpRewrite(mf.directory, &mf.manifest) if err != nil { return err } mf.fp = fp mf.manifest.Creations = netCreations mf.manifest.Deletions = 0 return nil } type countingReader struct { wrapped *bufio.Reader count int64 } func (r *countingReader) Read(p []byte) (n int, err error) { n, err = r.wrapped.Read(p) r.count += int64(n) return } func (r *countingReader) ReadByte() (b byte, err error) { b, err = r.wrapped.ReadByte() if err == nil { r.count++ } return } var ( errBadMagic = errors.New("manifest has bad magic") errBadChecksum = errors.New("manifest has checksum mismatch") ) // ReplayManifestFile reads the manifest file and constructs two manifest objects. (We need one // immutable copy and one mutable copy of the manifest. Easiest way is to construct two of them.) // Also, returns the last offset after a completely read manifest entry -- the file must be // truncated at that point before further appends are made (if there is a partial entry after // that). In normal conditions, truncOffset is the file size. 
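// // On-disk layout, reconstructed from helpRewrite and the read loop below (a // sketch, not a normative spec): // //   | magic "Bdgr" (4 bytes) | version uint32 (big endian) | //   | repeated frames: length uint32 (BE) | crc32 (Castagnoli, BE) | ManifestChangeSet protobuf |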
func ReplayManifestFile(fp *os.File) (Manifest, int64, error) { r := countingReader{wrapped: bufio.NewReader(fp)} var magicBuf [8]byte if _, err := io.ReadFull(&r, magicBuf[:]); err != nil { return Manifest{}, 0, errBadMagic } if !bytes.Equal(magicBuf[0:4], magicText[:]) { return Manifest{}, 0, errBadMagic } version := y.BytesToU32(magicBuf[4:8]) if version != magicVersion { return Manifest{}, 0, //nolint:lll fmt.Errorf("manifest has unsupported version: %d (we support %d).\n"+ "Please see https://github.com/dgraph-io/badger/blob/master/README.md#i-see-manifest-has-unsupported-version-x-we-support-y-error"+ " on how to fix this.", version, magicVersion) } stat, err := fp.Stat() if err != nil { return Manifest{}, 0, err } build := createManifest() var offset int64 for { offset = r.count var lenCrcBuf [8]byte _, err := io.ReadFull(&r, lenCrcBuf[:]) if err != nil { if err == io.EOF || err == io.ErrUnexpectedEOF { break } return Manifest{}, 0, err } length := y.BytesToU32(lenCrcBuf[0:4]) // Sanity check to ensure we don't over-allocate memory. if length > uint32(stat.Size()) { return Manifest{}, 0, errors.Errorf( "Buffer length: %d greater than file size: %d. Manifest file might be corrupted", length, stat.Size()) } var buf = make([]byte, length) if _, err := io.ReadFull(&r, buf); err != nil { if err == io.EOF || err == io.ErrUnexpectedEOF { break } return Manifest{}, 0, err } if crc32.Checksum(buf, y.CastagnoliCrcTable) != y.BytesToU32(lenCrcBuf[4:8]) { return Manifest{}, 0, errBadChecksum } var changeSet pb.ManifestChangeSet if err := proto.Unmarshal(buf, &changeSet); err != nil { return Manifest{}, 0, err } if err := applyChangeSet(&build, &changeSet); err != nil { return Manifest{}, 0, err } } return build, offset, nil } func applyManifestChange(build *Manifest, tc *pb.ManifestChange) error { switch tc.Op { case pb.ManifestChange_CREATE: if _, ok := build.Tables[tc.Id]; ok { return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id) } build.Tables[tc.Id] = TableManifest{ Level: uint8(tc.Level), KeyID: tc.KeyId, Compression: options.CompressionType(tc.Compression), } for len(build.Levels) <= int(tc.Level) { build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})}) } build.Levels[tc.Level].Tables[tc.Id] = struct{}{} build.Creations++ case pb.ManifestChange_DELETE: tm, ok := build.Tables[tc.Id] if !ok { return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id) } delete(build.Levels[tm.Level].Tables, tc.Id) delete(build.Tables, tc.Id) build.Deletions++ default: return fmt.Errorf("MANIFEST file has invalid manifestChange op") } return nil } // This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is // just plain broken. func applyChangeSet(build *Manifest, changeSet *pb.ManifestChangeSet) error { for _, change := range changeSet.Changes { if err := applyManifestChange(build, change); err != nil { return err } } return nil } func newCreateChange( id uint64, level int, keyID uint64, c options.CompressionType) *pb.ManifestChange { return &pb.ManifestChange{ Id: id, Op: pb.ManifestChange_CREATE, Level: uint32(level), KeyId: keyID, // Hard coding it, since we're supporting only AES for now. EncryptionAlgo: pb.EncryptionAlgo_aes, Compression: uint32(c), } } func newDeleteChange(id uint64) *pb.ManifestChange { return &pb.ManifestChange{ Id: id, Op: pb.ManifestChange_DELETE, } } badger-2.2007.2/manifest_test.go000066400000000000000000000144751372173116500164050ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. 
and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "fmt" "io/ioutil" "math/rand" "os" "path/filepath" "sort" "testing" "golang.org/x/net/trace" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) func TestManifestBasic(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) { kv, err := Open(opt) require.NoError(t, err) n := 5000 for i := 0; i < n; i++ { if (i % 10000) == 0 { fmt.Printf("Putting i=%d\n", i) } k := []byte(fmt.Sprintf("%16x", rand.Int63())) txnSet(t, kv, k, k, 0x00) } txnSet(t, kv, []byte("testkey"), []byte("testval"), 0x05) kv.validate() require.NoError(t, kv.Close()) } kv, err := Open(opt) require.NoError(t, err) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get([]byte("testkey")) require.NoError(t, err) require.EqualValues(t, "testval", string(getItemValue(t, item))) require.EqualValues(t, byte(0x05), item.UserMeta()) return nil })) require.NoError(t, kv.Close()) } func helpTestManifestFileCorruption(t *testing.T, off int64, errorContent string) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) { kv, err := Open(opt) require.NoError(t, err) require.NoError(t, kv.Close()) } fp, err := os.OpenFile(filepath.Join(dir, ManifestFilename), os.O_RDWR, 0) require.NoError(t, err) // Mess with magic value or version to force error _, err = fp.WriteAt([]byte{'X'}, off) require.NoError(t, err) require.NoError(t, fp.Close()) kv, err := Open(opt) defer func() { if kv != nil { kv.Close() } }() require.Error(t, err) require.Contains(t, err.Error(), errorContent) } func TestManifestMagic(t *testing.T) { helpTestManifestFileCorruption(t, 3, "bad magic") } func TestManifestVersion(t *testing.T) { helpTestManifestFileCorruption(t, 4, "unsupported version") } func TestManifestChecksum(t *testing.T) { helpTestManifestFileCorruption(t, 15, "checksum mismatch") } func key(prefix string, i int) string { return prefix + fmt.Sprintf("%04d", i) } func buildTestTable(t *testing.T, prefix string, n int, opts table.Options) *os.File { y.AssertTrue(n <= 10000) keyValues := make([][]string, n) for i := 0; i < n; i++ { k := key(prefix, i) v := fmt.Sprintf("%d", i) keyValues[i] = []string{k, v} } return buildTable(t, keyValues, opts) } // TODO - Move these to somewhere where table package can also use it. // keyValues is n by 2 where n is number of pairs. func buildTable(t *testing.T, keyValues [][]string, bopts table.Options) *os.File { if bopts.BloomFalsePositive == 0 { bopts.BloomFalsePositive = 0.01 } if bopts.BlockSize == 0 { bopts.BlockSize = 4 * 1024 } b := table.NewTableBuilder(bopts) defer b.Close() // TODO: Add test for file garbage collection here. No files should be left after the tests here. 
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) f, err := y.OpenSyncedFile(filename, true) if t != nil { require.NoError(t, err) } else { y.Check(err) } sort.Slice(keyValues, func(i, j int) bool { return keyValues[i][0] < keyValues[j][0] }) for _, kv := range keyValues { y.AssertTrue(len(kv) == 2) b.Add(y.KeyWithTs([]byte(kv[0]), 10), y.ValueStruct{ Value: []byte(kv[1]), Meta: 'A', UserMeta: 0, }, 0) } _, err = f.Write(b.Finish()) require.NoError(t, err, "unable to write to file.") f.Close() f, _ = y.OpenSyncedFile(filename, true) return f } func TestOverlappingKeyRangeError(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) kv, err := Open(DefaultOptions(dir)) require.NoError(t, err) defer kv.Close() lh0 := newLevelHandler(kv, 0) lh1 := newLevelHandler(kv, 1) opts := table.Options{LoadingMode: options.MemoryMap, ChkMode: options.OnTableAndBlockRead} f := buildTestTable(t, "k", 2, opts) t1, err := table.OpenTable(f, opts) require.NoError(t, err) defer t1.DecrRef() done := lh0.tryAddLevel0Table(t1) require.Equal(t, true, done) cd := compactDef{ thisLevel: lh0, nextLevel: lh1, elog: trace.New("Badger", "Compact"), } manifest := createManifest() lc, err := newLevelsController(kv, &manifest) require.NoError(t, err) done = lc.fillTablesL0(&cd) require.Equal(t, true, done) lc.runCompactDef(0, cd) f = buildTestTable(t, "l", 2, opts) t2, err := table.OpenTable(f, opts) require.NoError(t, err) defer t2.DecrRef() done = lh0.tryAddLevel0Table(t2) require.Equal(t, true, done) cd = compactDef{ thisLevel: lh0, nextLevel: lh1, elog: trace.New("Badger", "Compact"), } lc.fillTablesL0(&cd) lc.runCompactDef(0, cd) } func TestManifestRewrite(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) deletionsThreshold := 10 mf, m, err := helpOpenOrCreateManifestFile(dir, false, deletionsThreshold) defer func() { if mf != nil { mf.close() } }() require.NoError(t, err) require.Equal(t, 0, m.Creations) require.Equal(t, 0, m.Deletions) err = mf.addChanges([]*pb.ManifestChange{ newCreateChange(0, 0, 0, 0), }) require.NoError(t, err) for i := uint64(0); i < uint64(deletionsThreshold*3); i++ { ch := []*pb.ManifestChange{ newCreateChange(i+1, 0, 0, 0), newDeleteChange(i), } err := mf.addChanges(ch) require.NoError(t, err) } err = mf.close() require.NoError(t, err) mf = nil mf, m, err = helpOpenOrCreateManifestFile(dir, false, deletionsThreshold) require.NoError(t, err) require.Equal(t, map[uint64]TableManifest{ uint64(deletionsThreshold * 3): {Level: 0}, }, m.Tables) } badger-2.2007.2/merge.go000066400000000000000000000115551372173116500146330ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "sync" "time" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" ) // MergeOperator represents a Badger merge operator. 
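// // A hedged usage sketch (the key name is illustrative; add is a user-supplied // MergeFunc such as the uint64 adder defined in merge_test.go): // //   m := db.GetMergeOperator([]byte("counter"), add, 200*time.Millisecond) //   defer m.Stop() //   _ = m.Add(uint64ToBytes(1)) //   _ = m.Add(uint64ToBytes(2)) //   val, _ := m.Get() // merged result: 3, encoded as 8 bytes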
type MergeOperator struct { sync.RWMutex f MergeFunc db *DB key []byte closer *y.Closer } // MergeFunc accepts two byte slices, one representing an existing value, and // another representing a new value that needs to be ‘merged’ into it. MergeFunc // contains the logic to perform the ‘merge’ and return an updated value. // MergeFunc could perform operations like integer addition, list appends etc. // Note that the ordering of the operands is maintained. type MergeFunc func(existingVal, newVal []byte) []byte // GetMergeOperator creates a new MergeOperator for a given key and returns a // pointer to it. It also fires off a goroutine that performs a compaction using // the merge function that runs periodically, as specified by dur. func (db *DB) GetMergeOperator(key []byte, f MergeFunc, dur time.Duration) *MergeOperator { op := &MergeOperator{ f: f, db: db, key: key, closer: y.NewCloser(1), } go op.runCompactions(dur) return op } var errNoMerge = errors.New("No need for merge") func (op *MergeOperator) iterateAndMerge() (newVal []byte, latest uint64, err error) { txn := op.db.NewTransaction(false) defer txn.Discard() opt := DefaultIteratorOptions opt.AllVersions = true it := txn.NewKeyIterator(op.key, opt) defer it.Close() var numVersions int for it.Rewind(); it.Valid(); it.Next() { item := it.Item() numVersions++ if numVersions == 1 { // This should be the newVal, considering this is the latest version. newVal, err = item.ValueCopy(newVal) if err != nil { return nil, 0, err } latest = item.Version() } else { if err := item.Value(func(oldVal []byte) error { // The merge should always be on the newVal considering it has the merge result of // the latest version. The value read should be the oldVal. newVal = op.f(oldVal, newVal) return nil }); err != nil { return nil, 0, err } } if item.DiscardEarlierVersions() { break } } if numVersions == 0 { return nil, latest, ErrKeyNotFound } else if numVersions == 1 { return newVal, latest, errNoMerge } return newVal, latest, nil } func (op *MergeOperator) compact() error { op.Lock() defer op.Unlock() val, version, err := op.iterateAndMerge() if err == ErrKeyNotFound || err == errNoMerge { return nil } else if err != nil { return err } entries := []*Entry{ { Key: y.KeyWithTs(op.key, version), Value: val, meta: bitDiscardEarlierVersions, }, } // Write value back to the DB. It is important that we do not set the bitMergeEntry bit // here. When compaction happens, all the older merged entries will be removed. return op.db.batchSetAsync(entries, func(err error) { if err != nil { op.db.opt.Errorf("failed to insert the result of merge compaction: %s", err) } }) } func (op *MergeOperator) runCompactions(dur time.Duration) { ticker := time.NewTicker(dur) defer op.closer.Done() var stop bool for { select { case <-op.closer.HasBeenClosed(): stop = true case <-ticker.C: // wait for tick } if err := op.compact(); err != nil { op.db.opt.Errorf("failure while running merge operation: %s", err) } if stop { ticker.Stop() break } } } // Add records a value in Badger which will eventually be merged by a background // routine into the values that were recorded by previous invocations to Add(). func (op *MergeOperator) Add(val []byte) error { return op.db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry(op.key, val).withMergeBit()) }) } // Get returns the latest value for the merge operator, which is derived by // applying the merge function to all the values added so far. // // If Add has not been called even once, Get will return ErrKeyNotFound. 
func (op *MergeOperator) Get() ([]byte, error) { op.RLock() defer op.RUnlock() var existing []byte err := op.db.View(func(txn *Txn) (err error) { existing, _, err = op.iterateAndMerge() return err }) if err == errNoMerge { return existing, nil } return existing, err } // Stop waits for any pending merge to complete and then stops the background // goroutine. func (op *MergeOperator) Stop() { op.closer.SignalAndWait() } badger-2.2007.2/merge_test.go000066400000000000000000000107451372173116500156720ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "encoding/binary" "io/ioutil" "testing" "time" "github.com/stretchr/testify/require" ) func TestGetMergeOperator(t *testing.T) { t.Run("Get before Add", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { m := db.GetMergeOperator([]byte("merge"), add, 200*time.Millisecond) defer m.Stop() val, err := m.Get() require.Equal(t, ErrKeyNotFound, err) require.Nil(t, val) }) }) t.Run("Add and Get", func(t *testing.T) { key := []byte("merge") runBadgerTest(t, nil, func(t *testing.T, db *DB) { m := db.GetMergeOperator(key, add, 200*time.Millisecond) defer m.Stop() err := m.Add(uint64ToBytes(1)) require.NoError(t, err) m.Add(uint64ToBytes(2)) require.NoError(t, err) m.Add(uint64ToBytes(3)) require.NoError(t, err) res, err := m.Get() require.NoError(t, err) require.Equal(t, uint64(6), bytesToUint64(res)) }) }) t.Run("Add and Get slices", func(t *testing.T) { // Merge function to merge two byte slices add := func(originalValue, newValue []byte) []byte { return append(originalValue, newValue...) 
} runBadgerTest(t, nil, func(t *testing.T, db *DB) { m := db.GetMergeOperator([]byte("fooprefix"), add, 2*time.Millisecond) defer m.Stop() require.Nil(t, m.Add([]byte("A"))) require.Nil(t, m.Add([]byte("B"))) require.Nil(t, m.Add([]byte("C"))) value, err := m.Get() require.Nil(t, err) require.Equal(t, "ABC", string(value)) }) }) t.Run("Get Before Compact", func(t *testing.T) { key := []byte("merge") runBadgerTest(t, nil, func(t *testing.T, db *DB) { m := db.GetMergeOperator(key, add, 500*time.Millisecond) defer m.Stop() err := m.Add(uint64ToBytes(1)) require.NoError(t, err) m.Add(uint64ToBytes(2)) require.NoError(t, err) m.Add(uint64ToBytes(3)) require.NoError(t, err) res, err := m.Get() require.NoError(t, err) require.Equal(t, uint64(6), bytesToUint64(res)) }) }) t.Run("Get after Stop", func(t *testing.T) { key := []byte("merge") runBadgerTest(t, nil, func(t *testing.T, db *DB) { m := db.GetMergeOperator(key, add, 1*time.Second) err := m.Add(uint64ToBytes(1)) require.NoError(t, err) m.Add(uint64ToBytes(2)) require.NoError(t, err) m.Add(uint64ToBytes(3)) require.NoError(t, err) m.Stop() res, err := m.Get() require.NoError(t, err) require.Equal(t, uint64(6), bytesToUint64(res)) }) }) t.Run("Old keys should be removed after compaction", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) db, err := Open(opts) require.NoError(t, err) mergeKey := []byte("foo") m := db.GetMergeOperator(mergeKey, add, 2*time.Millisecond) count := 5000 // This will cause compaction from L0->L1 for i := 0; i < count; i++ { require.NoError(t, m.Add(uint64ToBytes(1))) } value, err := m.Get() require.Nil(t, err) require.Equal(t, uint64(count), bytesToUint64(value)) m.Stop() // Force compaction by closing DB. The compaction should discard all the old merged values require.Nil(t, db.Close()) db, err = Open(opts) require.NoError(t, err) defer db.Close() keyCount := 0 txn := db.NewTransaction(false) defer txn.Discard() iopt := DefaultIteratorOptions iopt.AllVersions = true it := txn.NewKeyIterator(mergeKey, iopt) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { keyCount++ } // We should have only one key in badger. All the other keys should've been removed by // compaction require.Equal(t, 1, keyCount) }) } func uint64ToBytes(i uint64) []byte { var buf [8]byte binary.BigEndian.PutUint64(buf[:], i) return buf[:] } func bytesToUint64(b []byte) uint64 { return binary.BigEndian.Uint64(b) } // Merge function to add two uint64 numbers func add(existing, new []byte) []byte { return uint64ToBytes(bytesToUint64(existing) + bytesToUint64(new)) } badger-2.2007.2/options.go000066400000000000000000000607221372173116500152270ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/table" ) // Note: If you add a new option X make sure you also add a WithX method on Options. 
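// // Options use value receivers, so the WithX builders chain without mutating // their input (a sketch; the path is illustrative): // //   opts := DefaultOptions("/tmp/badger"). //       WithSyncWrites(false). //       WithValueThreshold(1 << 10) //   db, err := Open(opts)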
// Options are parameters for creating a DB object. // // This package provides DefaultOptions which contains options that should // work for most applications. Consider using that as a starting point before // customizing it for your own needs. // // Each option X is documented on the WithX method. type Options struct { // Required options. Dir string ValueDir string // Usually modified options. SyncWrites bool TableLoadingMode options.FileLoadingMode ValueLogLoadingMode options.FileLoadingMode NumVersionsToKeep int ReadOnly bool Truncate bool Logger Logger Compression options.CompressionType InMemory bool // Fine tuning options. MaxTableSize int64 LevelSizeMultiplier int MaxLevels int ValueThreshold int NumMemtables int // Changing BlockSize across DB runs will not break badger. The block size is // read from the block index stored at the end of the table. BlockSize int BloomFalsePositive float64 KeepL0InMemory bool BlockCacheSize int64 IndexCacheSize int64 LoadBloomsOnOpen bool NumLevelZeroTables int NumLevelZeroTablesStall int LevelOneSize int64 ValueLogFileSize int64 ValueLogMaxEntries uint32 NumCompactors int CompactL0OnClose bool LogRotatesToFlush int32 ZSTDCompressionLevel int // When set, checksum will be validated for each entry read from the value log file. VerifyValueChecksum bool // Encryption related options. EncryptionKey []byte // encryption key EncryptionKeyRotationDuration time.Duration // key rotation duration // BypassLockGuard will bypass the lock guard on badger. Bypassing the lock // guard can cause data corruption if multiple badger instances are using // the same directory. Use this option with caution. BypassLockGuard bool // ChecksumVerificationMode decides when db should verify checksums for SSTable blocks. ChecksumVerificationMode options.ChecksumVerificationMode // DetectConflicts determines whether the transactions would be checked for // conflicts. The transactions can be processed at a higher rate when // conflict detection is disabled. DetectConflicts bool // Transaction start and commit timestamps are managed by the end-user. // This is only useful for databases built on top of Badger (like Dgraph). // Not recommended for most users. managedTxns bool // Flags for testing purposes // ------------------------------ maxBatchCount int64 // max entries in batch maxBatchSize int64 // max batch size in bytes } // DefaultOptions sets a list of recommended options for good performance. // Feel free to modify these to suit your needs with the WithX methods. func DefaultOptions(path string) Options { return Options{ Dir: path, ValueDir: path, LevelOneSize: 256 << 20, LevelSizeMultiplier: 10, TableLoadingMode: options.MemoryMap, ValueLogLoadingMode: options.MemoryMap, // table.MemoryMap to mmap() the tables. // table.Nothing to not preload the tables. MaxLevels: 7, MaxTableSize: 64 << 20, NumCompactors: 2, // Run at least 2 compactors. One is dedicated for L0. NumLevelZeroTables: 5, NumLevelZeroTablesStall: 15, NumMemtables: 5, BloomFalsePositive: 0.01, BlockSize: 4 * 1024, SyncWrites: true, NumVersionsToKeep: 1, CompactL0OnClose: true, KeepL0InMemory: false, VerifyValueChecksum: false, Compression: options.None, BlockCacheSize: 0, IndexCacheSize: 0, LoadBloomsOnOpen: true, // The following benchmarks were done on a 4 KB block size (default block size). The // compression ratio is supposed to increase with increasing compression level but since the // input for the compression algorithm is small (4 KB), we don't get significant benefit at // level 3.
// no_compression-16 10 502848865 ns/op 165.46 MB/s - // zstd_compression/level_1-16 7 739037966 ns/op 112.58 MB/s 2.93 // zstd_compression/level_3-16 7 756950250 ns/op 109.91 MB/s 2.72 // zstd_compression/level_15-16 1 11135686219 ns/op 7.47 MB/s 4.38 // Benchmark code can be found in the table/builder_test.go file. ZSTDCompressionLevel: 1, // Nothing to read/write value log using standard File I/O // MemoryMap to mmap() the value log files // (2^30 - 1)*2 when mmapping < 2^31 - 1, max int32. // -1 so 2*ValueLogFileSize won't overflow on 32-bit systems. ValueLogFileSize: 1<<30 - 1, ValueLogMaxEntries: 1000000, ValueThreshold: 1 << 10, // 1 KB. Truncate: false, Logger: defaultLogger(INFO), LogRotatesToFlush: 2, EncryptionKey: []byte{}, EncryptionKeyRotationDuration: 10 * 24 * time.Hour, // Default 10 days. DetectConflicts: true, } } func buildTableOptions(opt Options) table.Options { return table.Options{ BlockSize: opt.BlockSize, BloomFalsePositive: opt.BloomFalsePositive, LoadBloomsOnOpen: opt.LoadBloomsOnOpen, LoadingMode: opt.TableLoadingMode, ChkMode: opt.ChecksumVerificationMode, Compression: opt.Compression, ZSTDCompressionLevel: opt.ZSTDCompressionLevel, } } const ( maxValueThreshold = (1 << 20) // 1 MB ) // LSMOnlyOptions follows from DefaultOptions, but sets a higher ValueThreshold // so values would be collocated with the LSM tree, with the value log largely acting // as a write-ahead log only. These options would reduce the disk usage of the value // log, and make Badger act more like a typical LSM tree. func LSMOnlyOptions(path string) Options { // Let's not set any other options, because they can cause issues with the // size of key-value a user can pass to Badger. For example, if we set // ValueLogFileSize to 64MB, a user can't pass a value more than that. // Setting ValueLogMaxEntries to 1000 can generate too many files. // These options are better configured on a usage basis, than broadly here. // The ValueThreshold is the most important setting a user needs to tune to // achieve a heavier usage of the LSM tree. // NOTE: If a user does not want to set maxValueThreshold (1 MB) as the ValueThreshold because // of performance reasons, 1KB would be a good option too, allowing // values smaller than 1KB to be collocated with the keys in the LSM tree. return DefaultOptions(path).WithValueThreshold(maxValueThreshold /* 1 MB */) } // WithDir returns a new Options value with Dir set to the given value. // // Dir is the path of the directory where key data will be stored. // If it doesn't exist, Badger will try to create it for you. // This is set automatically to be the path given to `DefaultOptions`. func (opt Options) WithDir(val string) Options { opt.Dir = val return opt } // WithValueDir returns a new Options value with ValueDir set to the given value. // // ValueDir is the path of the directory where value data will be stored. // If it doesn't exist, Badger will try to create it for you. // This is set automatically to be the path given to `DefaultOptions`. func (opt Options) WithValueDir(val string) Options { opt.ValueDir = val return opt } // WithLoggingLevel returns a new Options value with the logging level of the // default logger set to the given value. // LoggingLevel sets the level of logging. It should be one of the DEBUG, INFO, // WARNING or ERROR levels. // // The default value of LoggingLevel is INFO. func (opt Options) WithLoggingLevel(val loggingLevel) Options { opt.Logger = defaultLogger(val) return opt } // WithSyncWrites returns a new Options value with SyncWrites set to the given value.
// // When SyncWrites is true all writes are synced to disk. Setting this to false would achieve better // performance, but may cause data loss in case of a crash. // // The default value of SyncWrites is true. func (opt Options) WithSyncWrites(val bool) Options { opt.SyncWrites = val return opt } // WithTableLoadingMode returns a new Options value with TableLoadingMode set to the given value. // // TableLoadingMode indicates which file loading mode should be used for the LSM tree data files. // // The default value of TableLoadingMode is options.MemoryMap. func (opt Options) WithTableLoadingMode(val options.FileLoadingMode) Options { opt.TableLoadingMode = val return opt } // WithValueLogLoadingMode returns a new Options value with ValueLogLoadingMode set to the given // value. // // ValueLogLoadingMode indicates which file loading mode should be used for the value log data // files. // // The default value of ValueLogLoadingMode is options.MemoryMap. func (opt Options) WithValueLogLoadingMode(val options.FileLoadingMode) Options { opt.ValueLogLoadingMode = val return opt } // WithNumVersionsToKeep returns a new Options value with NumVersionsToKeep set to the given value. // // NumVersionsToKeep sets how many versions to keep per key at most. // // The default value of NumVersionsToKeep is 1. func (opt Options) WithNumVersionsToKeep(val int) Options { opt.NumVersionsToKeep = val return opt } // WithReadOnly returns a new Options value with ReadOnly set to the given value. // // When ReadOnly is true the DB will be opened in read-only mode. // Multiple processes can open the same Badger DB. // Note: if the DB being opened had crashed before and has vlog data to be replayed, // ReadOnly will cause Open to fail with an appropriate message. // // The default value of ReadOnly is false. func (opt Options) WithReadOnly(val bool) Options { opt.ReadOnly = val return opt } // WithTruncate returns a new Options value with Truncate set to the given value. // // Truncate indicates whether value log files should be truncated to delete corrupt data, if any. // This option is ignored when ReadOnly is true. // // The default value of Truncate is false. func (opt Options) WithTruncate(val bool) Options { opt.Truncate = val return opt } // WithLogger returns a new Options value with Logger set to the given value. // // Logger provides a way to configure what logger each value of badger.DB uses. // // The default value of Logger writes to stderr using the log package from the Go standard library. func (opt Options) WithLogger(val Logger) Options { opt.Logger = val return opt } // WithMaxTableSize returns a new Options value with MaxTableSize set to the given value. // // MaxTableSize sets the maximum size in bytes for each LSM table or file. // // The default value of MaxTableSize is 64MB. func (opt Options) WithMaxTableSize(val int64) Options { opt.MaxTableSize = val return opt } // WithLevelSizeMultiplier returns a new Options value with LevelSizeMultiplier set to the given // value. // // LevelSizeMultiplier sets the ratio between the maximum sizes of contiguous levels in the LSM. // Once a level grows to be larger than this ratio allows, the compaction process will be // triggered. // // The default value of LevelSizeMultiplier is 10. func (opt Options) WithLevelSizeMultiplier(val int) Options { opt.LevelSizeMultiplier = val return opt } // WithMaxLevels returns a new Options value with MaxLevels set to the given value. // // MaxLevels sets the maximum number of levels of compaction allowed in the LSM.
// WithMaxLevels returns a new Options value with MaxLevels set to the given value.
//
// MaxLevels sets the maximum number of levels of compaction allowed in the LSM.
//
// The default value of MaxLevels is 7.
func (opt Options) WithMaxLevels(val int) Options {
	opt.MaxLevels = val
	return opt
}

// WithValueThreshold returns a new Options value with ValueThreshold set to the given value.
//
// ValueThreshold sets the threshold used to decide whether a value is stored directly in the LSM
// tree or separately in the value log files.
//
// The default value of ValueThreshold is 1 KB, but LSMOnlyOptions sets it to maxValueThreshold.
func (opt Options) WithValueThreshold(val int) Options {
	opt.ValueThreshold = val
	return opt
}

// WithNumMemtables returns a new Options value with NumMemtables set to the given value.
//
// NumMemtables sets the maximum number of tables to keep in memory before stalling.
//
// The default value of NumMemtables is 5.
func (opt Options) WithNumMemtables(val int) Options {
	opt.NumMemtables = val
	return opt
}

// WithBloomFalsePositive returns a new Options value with BloomFalsePositive set
// to the given value.
//
// BloomFalsePositive sets the false positive probability of the bloom filter in any SSTable.
// Before reading a key from a table, the bloom filter is checked for key existence.
// BloomFalsePositive affects the read performance of the DB: a lower value consumes
// more memory but produces fewer false positives.
//
// The default value of BloomFalsePositive is 0.01.
func (opt Options) WithBloomFalsePositive(val float64) Options {
	opt.BloomFalsePositive = val
	return opt
}

// WithBlockSize returns a new Options value with BlockSize set to the given value.
//
// BlockSize sets the size of any block in an SSTable. An SSTable is divided into multiple blocks
// internally. Each block is encoded using prefix diff encoding.
//
// The default value of BlockSize is 4KB.
func (opt Options) WithBlockSize(val int) Options {
	opt.BlockSize = val
	return opt
}

// WithNumLevelZeroTables returns a new Options value with NumLevelZeroTables set to the given
// value.
//
// NumLevelZeroTables sets the maximum number of Level 0 tables before compaction starts.
//
// The default value of NumLevelZeroTables is 5.
func (opt Options) WithNumLevelZeroTables(val int) Options {
	opt.NumLevelZeroTables = val
	return opt
}

// WithNumLevelZeroTablesStall returns a new Options value with NumLevelZeroTablesStall set to the
// given value.
//
// NumLevelZeroTablesStall sets the number of Level 0 tables that, once reached, causes the DB to
// stall until compaction succeeds.
//
// The default value of NumLevelZeroTablesStall is 10.
func (opt Options) WithNumLevelZeroTablesStall(val int) Options {
	opt.NumLevelZeroTablesStall = val
	return opt
}

// WithLevelOneSize returns a new Options value with LevelOneSize set to the given value.
//
// LevelOneSize sets the maximum total size for Level 1.
//
// The default value of LevelOneSize is 20MB.
func (opt Options) WithLevelOneSize(val int64) Options {
	opt.LevelOneSize = val
	return opt
}

// WithValueLogFileSize returns a new Options value with ValueLogFileSize set to the given value.
//
// ValueLogFileSize sets the maximum size of a single value log file.
//
// The default value of ValueLogFileSize is 1GB.
func (opt Options) WithValueLogFileSize(val int64) Options {
	opt.ValueLogFileSize = val
	return opt
}
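// A sketch of the trade-offs above, with hypothetical numbers rather than
// recommendations: a lower BloomFalsePositive buys fewer wasted disk reads at
// the cost of larger bloom filters, and a smaller ValueLogFileSize caps the
// largest storable value, since a single entry must fit within one file:
//
//	opts := DefaultOptions("/tmp/badger").
//		WithBloomFalsePositive(0.001).  // bigger filters, fewer false positives
//		WithValueLogFileSize(256 << 20) // 256 MB; values must fit within this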
// WithValueLogMaxEntries returns a new Options value with ValueLogMaxEntries set to the given
// value.
//
// ValueLogMaxEntries sets the maximum number of entries a value log file can hold, approximately.
// A value log file is rotated once it reaches either ValueLogFileSize bytes or
// ValueLogMaxEntries entries, whichever comes first.
//
// The default value of ValueLogMaxEntries is one million (1000000).
func (opt Options) WithValueLogMaxEntries(val uint32) Options {
	opt.ValueLogMaxEntries = val
	return opt
}

// WithNumCompactors returns a new Options value with NumCompactors set to the given value.
//
// NumCompactors sets the number of compaction workers to run concurrently.
// Setting this to zero stops compactions, which could eventually cause writes to block forever.
//
// The default value of NumCompactors is 2. One is dedicated just for L0.
func (opt Options) WithNumCompactors(val int) Options {
	opt.NumCompactors = val
	return opt
}

// WithCompactL0OnClose returns a new Options value with CompactL0OnClose set to the given value.
//
// CompactL0OnClose determines whether Level 0 should be compacted before closing the DB.
// This ensures that both reads and writes are efficient when the DB is opened later.
// CompactL0OnClose is set to true if KeepL0InMemory is set to true.
//
// The default value of CompactL0OnClose is true.
func (opt Options) WithCompactL0OnClose(val bool) Options {
	opt.CompactL0OnClose = val
	return opt
}

// WithLogRotatesToFlush returns a new Options value with LogRotatesToFlush set to the given value.
//
// LogRotatesToFlush sets the number of value log file rotations after which the memtables are
// flushed to disk. This is useful in write loads with fewer keys and larger values. Such a
// workload would fill up the value logs quickly, while not filling up the memtables. Thus, on a
// crash and restart, the value log head could cause the replay of a good number of value log
// files, which can slow things down on start.
//
// The default value of LogRotatesToFlush is 2.
func (opt Options) WithLogRotatesToFlush(val int32) Options {
	opt.LogRotatesToFlush = val
	return opt
}

// WithEncryptionKey returns a new Options value with EncryptionKey set to the given value.
//
// EncryptionKey is used to encrypt the data with AES. The AES variant is chosen based on the key
// size: 16 bytes will use AES-128, 24 bytes will use AES-192, and 32 bytes will use AES-256.
func (opt Options) WithEncryptionKey(key []byte) Options {
	opt.EncryptionKey = key
	return opt
}

// WithEncryptionKeyRotationDuration returns a new Options value with the duration set to
// the given value.
//
// The key registry uses this duration to rotate data keys: once the previously generated key is
// older than the given duration, the key registry creates a new one.
func (opt Options) WithEncryptionKeyRotationDuration(d time.Duration) Options {
	opt.EncryptionKeyRotationDuration = d
	return opt
}

// WithKeepL0InMemory returns a new Options value with KeepL0InMemory set to the given value.
//
// When KeepL0InMemory is set to true, we will keep all Level 0 tables in memory. This leads to
// better performance in writes as well as compactions. In case of a DB crash, the value log
// replay will take longer to complete, since memtables and all Level 0 tables will have to be
// recreated. This option also sets the CompactL0OnClose option to true.
//
// The default value of KeepL0InMemory is false.
func (opt Options) WithKeepL0InMemory(val bool) Options {
	opt.KeepL0InMemory = val
	return opt
}
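// A sketch of using the encryption setters above with a freshly generated
// AES-256 key. Illustrative only; it assumes "crypto/rand" is imported, and
// the rotation period simply restates the default:
//
//	key := make([]byte, 32) // 32 bytes => AES-256
//	if _, err := rand.Read(key); err != nil {
//		// handle the error
//	}
//	opts := DefaultOptions("/tmp/badger").
//		WithEncryptionKey(key).
//		WithEncryptionKeyRotationDuration(10 * 24 * time.Hour)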
// WithCompression returns a new Options value with Compression set to the given value.
//
// When compression is enabled, every block will be compressed using the specified algorithm.
// This option doesn't affect existing tables. Only the newly created tables will be compressed.
//
// The default compression algorithm used is zstd when built with Cgo. Without Cgo, the default is
// snappy. Compression is enabled by default.
func (opt Options) WithCompression(cType options.CompressionType) Options {
	opt.Compression = cType
	return opt
}

// WithVerifyValueChecksum returns a new Options value with VerifyValueChecksum set to
// the given value.
//
// When VerifyValueChecksum is set to true, the checksum will be verified for every entry read
// from the value log. If the value is stored in an SST (value size less than ValueThreshold),
// the checksum validation will not be done.
//
// The default value of VerifyValueChecksum is false.
func (opt Options) WithVerifyValueChecksum(val bool) Options {
	opt.VerifyValueChecksum = val
	return opt
}

// WithChecksumVerificationMode returns a new Options value with ChecksumVerificationMode set to
// the given value.
//
// ChecksumVerificationMode indicates when the db should verify checksums for SSTable blocks.
//
// The default value of ChecksumVerificationMode is options.NoVerification.
func (opt Options) WithChecksumVerificationMode(cvMode options.ChecksumVerificationMode) Options {
	opt.ChecksumVerificationMode = cvMode
	return opt
}

// WithBlockCacheSize returns a new Options value with BlockCacheSize set to the given value.
//
// This value specifies how much data the cache should hold in memory. A small cache size means
// lower memory consumption, but lookups/iterations will take longer. It is recommended to use a
// cache if you're using compression or encryption. If compression and encryption are both
// disabled, adding a cache will lead to unnecessary overhead, which will affect read performance.
// Setting the size to zero disables the cache altogether.
//
// The default value of BlockCacheSize is zero.
func (opt Options) WithBlockCacheSize(size int64) Options {
	opt.BlockCacheSize = size
	return opt
}

// WithInMemory returns a new Options value with InMemory mode set to the given value.
//
// When Badger is running in InMemory mode, everything is stored in memory. No value/sst files are
// created. In case of a crash, all data will be lost.
func (opt Options) WithInMemory(b bool) Options {
	opt.InMemory = b
	return opt
}

// WithZSTDCompressionLevel returns a new Options value with ZSTDCompressionLevel set
// to the given value.
//
// The ZSTD compression algorithm supports 20 compression levels. The higher the compression
// level, the better the compression ratio, and the lower the performance.
// We recommend using ZSTD compression level 1. Any level higher than 1 seems to
// deteriorate badger's performance.
// The following benchmarks were done on a 4 KB block size (the default block size). The
// compression ratio is supposed to increase with an increasing compression level, but since the
// input to the compression algorithm is small (4 KB), we don't get a significant benefit at
// level 3. It is advised to write your own benchmarks before choosing a compression algorithm
// or level.
//
// no_compression-16              10     502848865 ns/op    165.46 MB/s    -
// zstd_compression/level_1-16     7     739037966 ns/op    112.58 MB/s    2.93
// zstd_compression/level_3-16     7     756950250 ns/op    109.91 MB/s    2.72
// zstd_compression/level_15-16    1   11135686219 ns/op      7.47 MB/s    4.38
// Benchmark code can be found in the table/builder_test.go file.
func (opt Options) WithZSTDCompressionLevel(cLevel int) Options {
	opt.ZSTDCompressionLevel = cLevel
	return opt
}
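// A sketch of enabling ZSTD at level 1 together with a block cache, since a
// cache is recommended whenever compression or encryption is on. Illustrative
// only; the 256 MB cache size is a hypothetical choice:
//
//	opts := DefaultOptions("/tmp/badger").
//		WithCompression(options.ZSTD).
//		WithZSTDCompressionLevel(1).
//		WithBlockCacheSize(256 << 20) // 256 MB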
// WithBypassLockGuard returns a new Options value with BypassLockGuard
// set to the given value.
//
// When the BypassLockGuard option is set, badger will not acquire a lock on the
// directory. This could lead to data corruption if multiple badger instances
// write to the same data directory. Use this option with caution.
//
// The default value of BypassLockGuard is false.
func (opt Options) WithBypassLockGuard(b bool) Options {
	opt.BypassLockGuard = b
	return opt
}

// WithLoadBloomsOnOpen returns a new Options value with LoadBloomsOnOpen set to the given value.
//
// Badger uses bloom filters to speed up key lookups. When LoadBloomsOnOpen is set
// to false, bloom filters will be loaded lazily and not on DB open. Set this
// option to false to reduce the time taken to open the DB.
//
// The default value of LoadBloomsOnOpen is true.
func (opt Options) WithLoadBloomsOnOpen(b bool) Options {
	opt.LoadBloomsOnOpen = b
	return opt
}

// WithIndexCacheSize returns a new Options value with IndexCacheSize set to
// the given value.
//
// This value specifies how much memory should be used by the table indices. These
// indices include the block offsets and the bloom filters. Badger uses bloom
// filters to speed up lookups. Each table has its own bloom filter, and each
// bloom filter is approximately 5 MB.
//
// The default value of IndexCacheSize is 0, which means the cache is disabled
// and all the indices are kept in memory.
func (opt Options) WithIndexCacheSize(size int64) Options {
	opt.IndexCacheSize = size
	return opt
}

// WithDetectConflicts returns a new Options value with DetectConflicts set to the given value.
//
// DetectConflicts determines whether transactions are checked for conflicts
// before committing them. When this option is set to false
// (DetectConflicts=false), badger can process transactions at a higher rate.
// Setting this option to false might be useful when the user application
// deals with conflict detection and resolution itself.
//
// The default value of DetectConflicts is true.
func (opt Options) WithDetectConflicts(b bool) Options {
	opt.DetectConflicts = b
	return opt
}
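// A sketch of a bulk-ingestion setup where the application serializes writes
// itself, so transactional conflict detection is skipped. Illustrative only;
// whether the durability trade-off is acceptable depends on the workload:
//
//	opts := DefaultOptions("/tmp/badger").
//		WithDetectConflicts(false).
//		WithSyncWrites(false) // accept possible loss on crash for ingest speed
//	db, err := Open(opts)
//	if err != nil {
//		// handle the error
//	}
//	defer db.Close()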
badger-2.2007.2/options/000077500000000000000000000000001372173116500146715ustar00rootroot00000000000000badger-2.2007.2/options/options.go000066400000000000000000000036711372173116500167220ustar00rootroot00000000000000/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package options

// FileLoadingMode specifies how data in LSM table files and value log files should
// be loaded.
type FileLoadingMode int

const (
	// FileIO indicates that files must be loaded using standard I/O
	FileIO FileLoadingMode = iota
	// LoadToRAM indicates that the file must be loaded into RAM
	LoadToRAM
	// MemoryMap indicates that the file must be memory-mapped
	MemoryMap
)

// ChecksumVerificationMode tells when the DB should verify checksums for SSTable blocks.
type ChecksumVerificationMode int

const (
	// NoVerification indicates the DB should not verify checksums for SSTable blocks.
	NoVerification ChecksumVerificationMode = iota
	// OnTableRead indicates the checksum should be verified while opening an SSTable.
	OnTableRead
	// OnBlockRead indicates the checksum should be verified on every SSTable block read.
	OnBlockRead
	// OnTableAndBlockRead indicates the checksum should be verified
	// on SSTable opening and on every block read.
	OnTableAndBlockRead
)

// CompressionType specifies how a block should be compressed.
type CompressionType uint32

const (
	// None mode indicates that a block is not compressed.
	None CompressionType = 0
	// Snappy mode indicates that a block is compressed using the Snappy algorithm.
	Snappy CompressionType = 1
	// ZSTD mode indicates that a block is compressed using the ZSTD algorithm.
	ZSTD CompressionType = 2
)
badger-2.2007.2/pb/000077500000000000000000000000001372173116500135775ustar00rootroot00000000000000badger-2.2007.2/pb/gen.sh000077500000000000000000000006341372173116500147120ustar00rootroot00000000000000#!/bin/bash
# You might need to go get -v github.com/gogo/protobuf/...
protos=${GOPATH-$HOME/go}/src/github.com/dgraph-io/badger/pb
pushd $protos > /dev/null
protoc --gofast_out=plugins=grpc:. -I=. pb.proto
# Move the pb.pb.go file to the correct directory. This is necessary because protoc
# would generate the pb.pb.go file inside a different directory.
mv $protos/github.com/dgraph-io/badger/v2/pb/pb.pb.go ./
badger-2.2007.2/pb/pb.pb.go000066400000000000000000001645711372173116500151430ustar00rootroot00000000000000// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: pb.proto

package pb

import (
	fmt "fmt"
	proto "github.com/golang/protobuf/proto"
	io "io"
	math "math"
	math_bits "math/bits"
)

// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf

// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package type EncryptionAlgo int32 const ( EncryptionAlgo_aes EncryptionAlgo = 0 ) var EncryptionAlgo_name = map[int32]string{ 0: "aes", } var EncryptionAlgo_value = map[string]int32{ "aes": 0, } func (x EncryptionAlgo) String() string { return proto.EnumName(EncryptionAlgo_name, int32(x)) } func (EncryptionAlgo) EnumDescriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{0} } type ManifestChange_Operation int32 const ( ManifestChange_CREATE ManifestChange_Operation = 0 ManifestChange_DELETE ManifestChange_Operation = 1 ) var ManifestChange_Operation_name = map[int32]string{ 0: "CREATE", 1: "DELETE", } var ManifestChange_Operation_value = map[string]int32{ "CREATE": 0, "DELETE": 1, } func (x ManifestChange_Operation) String() string { return proto.EnumName(ManifestChange_Operation_name, int32(x)) } func (ManifestChange_Operation) EnumDescriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{3, 0} } type Checksum_Algorithm int32 const ( Checksum_CRC32C Checksum_Algorithm = 0 Checksum_XXHash64 Checksum_Algorithm = 1 ) var Checksum_Algorithm_name = map[int32]string{ 0: "CRC32C", 1: "XXHash64", } var Checksum_Algorithm_value = map[string]int32{ "CRC32C": 0, "XXHash64": 1, } func (x Checksum_Algorithm) String() string { return proto.EnumName(Checksum_Algorithm_name, int32(x)) } func (Checksum_Algorithm) EnumDescriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{6, 0} } type KV struct { Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` Value []byte `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"` UserMeta []byte `protobuf:"bytes,3,opt,name=user_meta,json=userMeta,proto3" json:"user_meta,omitempty"` Version uint64 `protobuf:"varint,4,opt,name=version,proto3" json:"version,omitempty"` ExpiresAt uint64 `protobuf:"varint,5,opt,name=expires_at,json=expiresAt,proto3" json:"expires_at,omitempty"` Meta []byte `protobuf:"bytes,6,opt,name=meta,proto3" json:"meta,omitempty"` // Stream id is used to identify which stream the KV came from. StreamId uint32 `protobuf:"varint,10,opt,name=stream_id,json=streamId,proto3" json:"stream_id,omitempty"` // Stream done is used to indicate end of stream. 
StreamDone bool `protobuf:"varint,11,opt,name=stream_done,json=streamDone,proto3" json:"stream_done,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *KV) Reset() { *m = KV{} } func (m *KV) String() string { return proto.CompactTextString(m) } func (*KV) ProtoMessage() {} func (*KV) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{0} } func (m *KV) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *KV) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_KV.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *KV) XXX_Merge(src proto.Message) { xxx_messageInfo_KV.Merge(m, src) } func (m *KV) XXX_Size() int { return m.Size() } func (m *KV) XXX_DiscardUnknown() { xxx_messageInfo_KV.DiscardUnknown(m) } var xxx_messageInfo_KV proto.InternalMessageInfo func (m *KV) GetKey() []byte { if m != nil { return m.Key } return nil } func (m *KV) GetValue() []byte { if m != nil { return m.Value } return nil } func (m *KV) GetUserMeta() []byte { if m != nil { return m.UserMeta } return nil } func (m *KV) GetVersion() uint64 { if m != nil { return m.Version } return 0 } func (m *KV) GetExpiresAt() uint64 { if m != nil { return m.ExpiresAt } return 0 } func (m *KV) GetMeta() []byte { if m != nil { return m.Meta } return nil } func (m *KV) GetStreamId() uint32 { if m != nil { return m.StreamId } return 0 } func (m *KV) GetStreamDone() bool { if m != nil { return m.StreamDone } return false } type KVList struct { Kv []*KV `protobuf:"bytes,1,rep,name=kv,proto3" json:"kv,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *KVList) Reset() { *m = KVList{} } func (m *KVList) String() string { return proto.CompactTextString(m) } func (*KVList) ProtoMessage() {} func (*KVList) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{1} } func (m *KVList) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *KVList) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_KVList.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *KVList) XXX_Merge(src proto.Message) { xxx_messageInfo_KVList.Merge(m, src) } func (m *KVList) XXX_Size() int { return m.Size() } func (m *KVList) XXX_DiscardUnknown() { xxx_messageInfo_KVList.DiscardUnknown(m) } var xxx_messageInfo_KVList proto.InternalMessageInfo func (m *KVList) GetKv() []*KV { if m != nil { return m.Kv } return nil } type ManifestChangeSet struct { // A set of changes that are applied atomically. 
Changes []*ManifestChange `protobuf:"bytes,1,rep,name=changes,proto3" json:"changes,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *ManifestChangeSet) Reset() { *m = ManifestChangeSet{} } func (m *ManifestChangeSet) String() string { return proto.CompactTextString(m) } func (*ManifestChangeSet) ProtoMessage() {} func (*ManifestChangeSet) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{2} } func (m *ManifestChangeSet) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *ManifestChangeSet) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_ManifestChangeSet.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *ManifestChangeSet) XXX_Merge(src proto.Message) { xxx_messageInfo_ManifestChangeSet.Merge(m, src) } func (m *ManifestChangeSet) XXX_Size() int { return m.Size() } func (m *ManifestChangeSet) XXX_DiscardUnknown() { xxx_messageInfo_ManifestChangeSet.DiscardUnknown(m) } var xxx_messageInfo_ManifestChangeSet proto.InternalMessageInfo func (m *ManifestChangeSet) GetChanges() []*ManifestChange { if m != nil { return m.Changes } return nil } type ManifestChange struct { Id uint64 `protobuf:"varint,1,opt,name=Id,proto3" json:"Id,omitempty"` Op ManifestChange_Operation `protobuf:"varint,2,opt,name=Op,proto3,enum=badgerpb2.ManifestChange_Operation" json:"Op,omitempty"` Level uint32 `protobuf:"varint,3,opt,name=Level,proto3" json:"Level,omitempty"` KeyId uint64 `protobuf:"varint,4,opt,name=key_id,json=keyId,proto3" json:"key_id,omitempty"` EncryptionAlgo EncryptionAlgo `protobuf:"varint,5,opt,name=encryption_algo,json=encryptionAlgo,proto3,enum=badgerpb2.EncryptionAlgo" json:"encryption_algo,omitempty"` Compression uint32 `protobuf:"varint,6,opt,name=compression,proto3" json:"compression,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *ManifestChange) Reset() { *m = ManifestChange{} } func (m *ManifestChange) String() string { return proto.CompactTextString(m) } func (*ManifestChange) ProtoMessage() {} func (*ManifestChange) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{3} } func (m *ManifestChange) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *ManifestChange) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_ManifestChange.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *ManifestChange) XXX_Merge(src proto.Message) { xxx_messageInfo_ManifestChange.Merge(m, src) } func (m *ManifestChange) XXX_Size() int { return m.Size() } func (m *ManifestChange) XXX_DiscardUnknown() { xxx_messageInfo_ManifestChange.DiscardUnknown(m) } var xxx_messageInfo_ManifestChange proto.InternalMessageInfo func (m *ManifestChange) GetId() uint64 { if m != nil { return m.Id } return 0 } func (m *ManifestChange) GetOp() ManifestChange_Operation { if m != nil { return m.Op } return ManifestChange_CREATE } func (m *ManifestChange) GetLevel() uint32 { if m != nil { return m.Level } return 0 } func (m *ManifestChange) GetKeyId() uint64 { if m != nil { return m.KeyId } return 0 } func (m *ManifestChange) GetEncryptionAlgo() EncryptionAlgo { if m != nil { return m.EncryptionAlgo 
} return EncryptionAlgo_aes } func (m *ManifestChange) GetCompression() uint32 { if m != nil { return m.Compression } return 0 } type BlockOffset struct { Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` Offset uint32 `protobuf:"varint,2,opt,name=offset,proto3" json:"offset,omitempty"` Len uint32 `protobuf:"varint,3,opt,name=len,proto3" json:"len,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *BlockOffset) Reset() { *m = BlockOffset{} } func (m *BlockOffset) String() string { return proto.CompactTextString(m) } func (*BlockOffset) ProtoMessage() {} func (*BlockOffset) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{4} } func (m *BlockOffset) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *BlockOffset) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_BlockOffset.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *BlockOffset) XXX_Merge(src proto.Message) { xxx_messageInfo_BlockOffset.Merge(m, src) } func (m *BlockOffset) XXX_Size() int { return m.Size() } func (m *BlockOffset) XXX_DiscardUnknown() { xxx_messageInfo_BlockOffset.DiscardUnknown(m) } var xxx_messageInfo_BlockOffset proto.InternalMessageInfo func (m *BlockOffset) GetKey() []byte { if m != nil { return m.Key } return nil } func (m *BlockOffset) GetOffset() uint32 { if m != nil { return m.Offset } return 0 } func (m *BlockOffset) GetLen() uint32 { if m != nil { return m.Len } return 0 } type TableIndex struct { Offsets []*BlockOffset `protobuf:"bytes,1,rep,name=offsets,proto3" json:"offsets,omitempty"` BloomFilter []byte `protobuf:"bytes,2,opt,name=bloom_filter,json=bloomFilter,proto3" json:"bloom_filter,omitempty"` EstimatedSize uint64 `protobuf:"varint,3,opt,name=estimated_size,json=estimatedSize,proto3" json:"estimated_size,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *TableIndex) Reset() { *m = TableIndex{} } func (m *TableIndex) String() string { return proto.CompactTextString(m) } func (*TableIndex) ProtoMessage() {} func (*TableIndex) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{5} } func (m *TableIndex) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *TableIndex) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_TableIndex.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *TableIndex) XXX_Merge(src proto.Message) { xxx_messageInfo_TableIndex.Merge(m, src) } func (m *TableIndex) XXX_Size() int { return m.Size() } func (m *TableIndex) XXX_DiscardUnknown() { xxx_messageInfo_TableIndex.DiscardUnknown(m) } var xxx_messageInfo_TableIndex proto.InternalMessageInfo func (m *TableIndex) GetOffsets() []*BlockOffset { if m != nil { return m.Offsets } return nil } func (m *TableIndex) GetBloomFilter() []byte { if m != nil { return m.BloomFilter } return nil } func (m *TableIndex) GetEstimatedSize() uint64 { if m != nil { return m.EstimatedSize } return 0 } type Checksum struct { Algo Checksum_Algorithm `protobuf:"varint,1,opt,name=algo,proto3,enum=badgerpb2.Checksum_Algorithm" json:"algo,omitempty"` Sum uint64 `protobuf:"varint,2,opt,name=sum,proto3" 
json:"sum,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *Checksum) Reset() { *m = Checksum{} } func (m *Checksum) String() string { return proto.CompactTextString(m) } func (*Checksum) ProtoMessage() {} func (*Checksum) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{6} } func (m *Checksum) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *Checksum) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_Checksum.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *Checksum) XXX_Merge(src proto.Message) { xxx_messageInfo_Checksum.Merge(m, src) } func (m *Checksum) XXX_Size() int { return m.Size() } func (m *Checksum) XXX_DiscardUnknown() { xxx_messageInfo_Checksum.DiscardUnknown(m) } var xxx_messageInfo_Checksum proto.InternalMessageInfo func (m *Checksum) GetAlgo() Checksum_Algorithm { if m != nil { return m.Algo } return Checksum_CRC32C } func (m *Checksum) GetSum() uint64 { if m != nil { return m.Sum } return 0 } type DataKey struct { KeyId uint64 `protobuf:"varint,1,opt,name=key_id,json=keyId,proto3" json:"key_id,omitempty"` Data []byte `protobuf:"bytes,2,opt,name=data,proto3" json:"data,omitempty"` Iv []byte `protobuf:"bytes,3,opt,name=iv,proto3" json:"iv,omitempty"` CreatedAt int64 `protobuf:"varint,4,opt,name=created_at,json=createdAt,proto3" json:"created_at,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *DataKey) Reset() { *m = DataKey{} } func (m *DataKey) String() string { return proto.CompactTextString(m) } func (*DataKey) ProtoMessage() {} func (*DataKey) Descriptor() ([]byte, []int) { return fileDescriptor_f80abaa17e25ccc8, []int{7} } func (m *DataKey) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *DataKey) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_DataKey.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *DataKey) XXX_Merge(src proto.Message) { xxx_messageInfo_DataKey.Merge(m, src) } func (m *DataKey) XXX_Size() int { return m.Size() } func (m *DataKey) XXX_DiscardUnknown() { xxx_messageInfo_DataKey.DiscardUnknown(m) } var xxx_messageInfo_DataKey proto.InternalMessageInfo func (m *DataKey) GetKeyId() uint64 { if m != nil { return m.KeyId } return 0 } func (m *DataKey) GetData() []byte { if m != nil { return m.Data } return nil } func (m *DataKey) GetIv() []byte { if m != nil { return m.Iv } return nil } func (m *DataKey) GetCreatedAt() int64 { if m != nil { return m.CreatedAt } return 0 } func init() { proto.RegisterEnum("badgerpb2.EncryptionAlgo", EncryptionAlgo_name, EncryptionAlgo_value) proto.RegisterEnum("badgerpb2.ManifestChange_Operation", ManifestChange_Operation_name, ManifestChange_Operation_value) proto.RegisterEnum("badgerpb2.Checksum_Algorithm", Checksum_Algorithm_name, Checksum_Algorithm_value) proto.RegisterType((*KV)(nil), "badgerpb2.KV") proto.RegisterType((*KVList)(nil), "badgerpb2.KVList") proto.RegisterType((*ManifestChangeSet)(nil), "badgerpb2.ManifestChangeSet") proto.RegisterType((*ManifestChange)(nil), "badgerpb2.ManifestChange") proto.RegisterType((*BlockOffset)(nil), "badgerpb2.BlockOffset") 
proto.RegisterType((*TableIndex)(nil), "badgerpb2.TableIndex") proto.RegisterType((*Checksum)(nil), "badgerpb2.Checksum") proto.RegisterType((*DataKey)(nil), "badgerpb2.DataKey") } func init() { proto.RegisterFile("pb.proto", fileDescriptor_f80abaa17e25ccc8) } var fileDescriptor_f80abaa17e25ccc8 = []byte{ // 691 bytes of a gzipped FileDescriptorProto 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x74, 0x54, 0xdd, 0x6e, 0xda, 0x4a, 0x10, 0xc6, 0xc6, 0xe1, 0x67, 0x08, 0x1c, 0xce, 0xea, 0x9c, 0xc8, 0x51, 0x15, 0x4a, 0x1c, 0x45, 0x45, 0x95, 0x0a, 0x2d, 0x54, 0xbd, 0x27, 0x84, 0x2a, 0x88, 0x44, 0x48, 0x9b, 0x28, 0x8a, 0x7a, 0x83, 0xd6, 0xf6, 0x00, 0x16, 0xfe, 0x93, 0x77, 0xb1, 0x42, 0x9e, 0xa0, 0x37, 0xbd, 0xef, 0x23, 0xf5, 0xb2, 0x17, 0x7d, 0x80, 0x2a, 0x7d, 0x91, 0xca, 0x6b, 0x83, 0x40, 0x6a, 0xef, 0x66, 0xbe, 0xf9, 0x76, 0x67, 0xbf, 0x6f, 0xc6, 0x86, 0x52, 0x68, 0xb6, 0xc3, 0x28, 0x10, 0x01, 0x29, 0x9b, 0xcc, 0x9e, 0x63, 0x14, 0x9a, 0x5d, 0xe3, 0x87, 0x02, 0xea, 0xf8, 0x9e, 0xd4, 0x21, 0xbf, 0xc4, 0xb5, 0xae, 0x34, 0x95, 0xd6, 0x21, 0x4d, 0x42, 0xf2, 0x1f, 0x1c, 0xc4, 0xcc, 0x5d, 0xa1, 0xae, 0x4a, 0x2c, 0x4d, 0xc8, 0x0b, 0x28, 0xaf, 0x38, 0x46, 0x53, 0x0f, 0x05, 0xd3, 0xf3, 0xb2, 0x52, 0x4a, 0x80, 0x1b, 0x14, 0x8c, 0xe8, 0x50, 0x8c, 0x31, 0xe2, 0x4e, 0xe0, 0xeb, 0x5a, 0x53, 0x69, 0x69, 0x74, 0x93, 0x92, 0x13, 0x00, 0x7c, 0x0c, 0x9d, 0x08, 0xf9, 0x94, 0x09, 0xfd, 0x40, 0x16, 0xcb, 0x19, 0xd2, 0x17, 0x84, 0x80, 0x26, 0x2f, 0x2c, 0xc8, 0x0b, 0x65, 0x9c, 0x74, 0xe2, 0x22, 0x42, 0xe6, 0x4d, 0x1d, 0x5b, 0x87, 0xa6, 0xd2, 0xaa, 0xd2, 0x52, 0x0a, 0x8c, 0x6c, 0xf2, 0x12, 0x2a, 0x59, 0xd1, 0x0e, 0x7c, 0xd4, 0x2b, 0x4d, 0xa5, 0x55, 0xa2, 0x90, 0x42, 0x97, 0x81, 0x8f, 0xc6, 0x2b, 0x28, 0x8c, 0xef, 0xaf, 0x1d, 0x2e, 0xc8, 0x09, 0xa8, 0xcb, 0x58, 0x57, 0x9a, 0xf9, 0x56, 0xa5, 0x5b, 0x6d, 0x6f, 0x85, 0xb7, 0xc7, 0xf7, 0x54, 0x5d, 0xc6, 0xc6, 0x15, 0xfc, 0x7b, 0xc3, 0x7c, 0x67, 0x86, 0x5c, 0x0c, 0x16, 0xcc, 0x9f, 0xe3, 0x2d, 0x0a, 0xd2, 0x83, 0xa2, 0x25, 0x13, 0x9e, 0x1d, 0x3c, 0xde, 0x39, 0xb8, 0x4f, 0xa7, 0x1b, 0xa6, 0xf1, 0x45, 0x85, 0xda, 0x7e, 0x8d, 0xd4, 0x40, 0x1d, 0xd9, 0xd2, 0x54, 0x8d, 0xaa, 0x23, 0x9b, 0xf4, 0x40, 0x9d, 0x84, 0xd2, 0xd0, 0x5a, 0xf7, 0xec, 0xaf, 0x57, 0xb6, 0x27, 0x21, 0x46, 0x4c, 0x38, 0x81, 0x4f, 0xd5, 0x49, 0x98, 0x0c, 0xe2, 0x1a, 0x63, 0x74, 0xa5, 0xdd, 0x55, 0x9a, 0x26, 0xe4, 0x7f, 0x28, 0x2c, 0x71, 0x9d, 0x78, 0x93, 0x5a, 0x7d, 0xb0, 0xc4, 0xf5, 0xc8, 0x26, 0x17, 0xf0, 0x0f, 0xfa, 0x56, 0xb4, 0x0e, 0x93, 0xe3, 0x53, 0xe6, 0xce, 0x03, 0xe9, 0x76, 0x6d, 0x4f, 0xc1, 0x70, 0xcb, 0xe8, 0xbb, 0xf3, 0x80, 0xd6, 0x70, 0x2f, 0x27, 0x4d, 0xa8, 0x58, 0x81, 0x17, 0x46, 0xc8, 0xe5, 0x28, 0x0b, 0xb2, 0xed, 0x2e, 0x64, 0x9c, 0x41, 0x79, 0xfb, 0x46, 0x02, 0x50, 0x18, 0xd0, 0x61, 0xff, 0x6e, 0x58, 0xcf, 0x25, 0xf1, 0xe5, 0xf0, 0x7a, 0x78, 0x37, 0xac, 0x2b, 0xc6, 0x08, 0x2a, 0x17, 0x6e, 0x60, 0x2d, 0x27, 0xb3, 0x19, 0x47, 0xf1, 0x87, 0x0d, 0x3b, 0x82, 0x42, 0x20, 0x6b, 0xd2, 0x91, 0x2a, 0xcd, 0xb2, 0x84, 0xe9, 0xa2, 0x9f, 0xc9, 0x4d, 0x42, 0xe3, 0xb3, 0x02, 0x70, 0xc7, 0x4c, 0x17, 0x47, 0xbe, 0x8d, 0x8f, 0xe4, 0x2d, 0x14, 0x53, 0xea, 0x66, 0x3c, 0x47, 0x3b, 0xe2, 0x76, 0x7a, 0xd2, 0x0d, 0x8d, 0x9c, 0xc2, 0xa1, 0xe9, 0x06, 0x81, 0x37, 0x9d, 0x39, 0xae, 0xc0, 0x28, 0xdb, 0xe9, 0x8a, 0xc4, 0x3e, 0x4a, 0x88, 0x9c, 0x43, 0x0d, 0xb9, 0x70, 0x3c, 0x26, 0xd0, 0x9e, 0x72, 0xe7, 0x09, 0xe5, 0x03, 0x34, 0x5a, 0xdd, 0xa2, 0xb7, 0xce, 0x13, 0x1a, 0x31, 0x94, 0x06, 0x0b, 0xb4, 0x96, 0x7c, 0xe5, 0x91, 0x77, 0xa0, 0x49, 0x87, 0x15, 0xe9, 0xf0, 0xc9, 0xce, 0x23, 0x36, 0x94, 0x76, 0x62, 0x68, 
0xe4, 0x88, 0x85, 0x47, 0x25, 0x35, 0xd1, 0xc6, 0x57, 0x9e, 0xec, 0xaf, 0xd1, 0x24, 0x34, 0xce, 0xa1, 0xbc, 0x25, 0xa5, 0x5e, 0x0e, 0x7a, 0xdd, 0x41, 0x3d, 0x47, 0x0e, 0xa1, 0xf4, 0xf0, 0x70, 0xc5, 0xf8, 0xe2, 0xc3, 0xfb, 0xba, 0x62, 0x58, 0x50, 0xbc, 0x64, 0x82, 0x8d, 0x71, 0xbd, 0x33, 0x7a, 0x65, 0x77, 0xf4, 0x04, 0x34, 0x9b, 0x09, 0x96, 0x69, 0x93, 0x71, 0xb2, 0x80, 0x4e, 0x9c, 0x7d, 0xa7, 0xaa, 0x13, 0x27, 0xdf, 0xa1, 0x15, 0xa1, 0x94, 0xc8, 0x84, 0xdc, 0x9c, 0x3c, 0x2d, 0x67, 0x48, 0x5f, 0xbc, 0x3e, 0x86, 0xda, 0xfe, 0x6e, 0x90, 0x22, 0xe4, 0x19, 0xf2, 0x7a, 0xee, 0xa2, 0xf7, 0xed, 0xb9, 0xa1, 0x7c, 0x7f, 0x6e, 0x28, 0x3f, 0x9f, 0x1b, 0xca, 0xd7, 0x5f, 0x8d, 0xdc, 0xa7, 0xd3, 0xb9, 0x23, 0x16, 0x2b, 0xb3, 0x6d, 0x05, 0x5e, 0xc7, 0x9e, 0x47, 0x2c, 0x5c, 0xbc, 0x71, 0x82, 0x4e, 0xea, 0x41, 0x27, 0xee, 0x76, 0x42, 0xd3, 0x2c, 0xc8, 0xdf, 0x4d, 0xef, 0x77, 0x00, 0x00, 0x00, 0xff, 0xff, 0x3a, 0xcd, 0x7e, 0xad, 0x7a, 0x04, 0x00, 0x00, } func (m *KV) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *KV) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *KV) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if m.StreamDone { i-- if m.StreamDone { dAtA[i] = 1 } else { dAtA[i] = 0 } i-- dAtA[i] = 0x58 } if m.StreamId != 0 { i = encodeVarintPb(dAtA, i, uint64(m.StreamId)) i-- dAtA[i] = 0x50 } if len(m.Meta) > 0 { i -= len(m.Meta) copy(dAtA[i:], m.Meta) i = encodeVarintPb(dAtA, i, uint64(len(m.Meta))) i-- dAtA[i] = 0x32 } if m.ExpiresAt != 0 { i = encodeVarintPb(dAtA, i, uint64(m.ExpiresAt)) i-- dAtA[i] = 0x28 } if m.Version != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Version)) i-- dAtA[i] = 0x20 } if len(m.UserMeta) > 0 { i -= len(m.UserMeta) copy(dAtA[i:], m.UserMeta) i = encodeVarintPb(dAtA, i, uint64(len(m.UserMeta))) i-- dAtA[i] = 0x1a } if len(m.Value) > 0 { i -= len(m.Value) copy(dAtA[i:], m.Value) i = encodeVarintPb(dAtA, i, uint64(len(m.Value))) i-- dAtA[i] = 0x12 } if len(m.Key) > 0 { i -= len(m.Key) copy(dAtA[i:], m.Key) i = encodeVarintPb(dAtA, i, uint64(len(m.Key))) i-- dAtA[i] = 0xa } return len(dAtA) - i, nil } func (m *KVList) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *KVList) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *KVList) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if len(m.Kv) > 0 { for iNdEx := len(m.Kv) - 1; iNdEx >= 0; iNdEx-- { { size, err := m.Kv[iNdEx].MarshalToSizedBuffer(dAtA[:i]) if err != nil { return 0, err } i -= size i = encodeVarintPb(dAtA, i, uint64(size)) } i-- dAtA[i] = 0xa } } return len(dAtA) - i, nil } func (m *ManifestChangeSet) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *ManifestChangeSet) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *ManifestChangeSet) 
MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if len(m.Changes) > 0 { for iNdEx := len(m.Changes) - 1; iNdEx >= 0; iNdEx-- { { size, err := m.Changes[iNdEx].MarshalToSizedBuffer(dAtA[:i]) if err != nil { return 0, err } i -= size i = encodeVarintPb(dAtA, i, uint64(size)) } i-- dAtA[i] = 0xa } } return len(dAtA) - i, nil } func (m *ManifestChange) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *ManifestChange) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *ManifestChange) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if m.Compression != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Compression)) i-- dAtA[i] = 0x30 } if m.EncryptionAlgo != 0 { i = encodeVarintPb(dAtA, i, uint64(m.EncryptionAlgo)) i-- dAtA[i] = 0x28 } if m.KeyId != 0 { i = encodeVarintPb(dAtA, i, uint64(m.KeyId)) i-- dAtA[i] = 0x20 } if m.Level != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Level)) i-- dAtA[i] = 0x18 } if m.Op != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Op)) i-- dAtA[i] = 0x10 } if m.Id != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Id)) i-- dAtA[i] = 0x8 } return len(dAtA) - i, nil } func (m *BlockOffset) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *BlockOffset) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *BlockOffset) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if m.Len != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Len)) i-- dAtA[i] = 0x18 } if m.Offset != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Offset)) i-- dAtA[i] = 0x10 } if len(m.Key) > 0 { i -= len(m.Key) copy(dAtA[i:], m.Key) i = encodeVarintPb(dAtA, i, uint64(len(m.Key))) i-- dAtA[i] = 0xa } return len(dAtA) - i, nil } func (m *TableIndex) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *TableIndex) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *TableIndex) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if m.EstimatedSize != 0 { i = encodeVarintPb(dAtA, i, uint64(m.EstimatedSize)) i-- dAtA[i] = 0x18 } if len(m.BloomFilter) > 0 { i -= len(m.BloomFilter) copy(dAtA[i:], m.BloomFilter) i = encodeVarintPb(dAtA, i, uint64(len(m.BloomFilter))) i-- dAtA[i] = 0x12 } if len(m.Offsets) > 0 { for iNdEx := len(m.Offsets) - 1; iNdEx >= 0; iNdEx-- { { size, err := m.Offsets[iNdEx].MarshalToSizedBuffer(dAtA[:i]) if err != nil { return 0, err } i -= size i = encodeVarintPb(dAtA, i, uint64(size)) } i-- dAtA[i] = 0xa } } return len(dAtA) - i, nil } func (m *Checksum) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = 
make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *Checksum) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *Checksum) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if m.Sum != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Sum)) i-- dAtA[i] = 0x10 } if m.Algo != 0 { i = encodeVarintPb(dAtA, i, uint64(m.Algo)) i-- dAtA[i] = 0x8 } return len(dAtA) - i, nil } func (m *DataKey) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *DataKey) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *DataKey) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if m.XXX_unrecognized != nil { i -= len(m.XXX_unrecognized) copy(dAtA[i:], m.XXX_unrecognized) } if m.CreatedAt != 0 { i = encodeVarintPb(dAtA, i, uint64(m.CreatedAt)) i-- dAtA[i] = 0x20 } if len(m.Iv) > 0 { i -= len(m.Iv) copy(dAtA[i:], m.Iv) i = encodeVarintPb(dAtA, i, uint64(len(m.Iv))) i-- dAtA[i] = 0x1a } if len(m.Data) > 0 { i -= len(m.Data) copy(dAtA[i:], m.Data) i = encodeVarintPb(dAtA, i, uint64(len(m.Data))) i-- dAtA[i] = 0x12 } if m.KeyId != 0 { i = encodeVarintPb(dAtA, i, uint64(m.KeyId)) i-- dAtA[i] = 0x8 } return len(dAtA) - i, nil } func encodeVarintPb(dAtA []byte, offset int, v uint64) int { offset -= sovPb(v) base := offset for v >= 1<<7 { dAtA[offset] = uint8(v&0x7f | 0x80) v >>= 7 offset++ } dAtA[offset] = uint8(v) return base } func (m *KV) Size() (n int) { if m == nil { return 0 } var l int _ = l l = len(m.Key) if l > 0 { n += 1 + l + sovPb(uint64(l)) } l = len(m.Value) if l > 0 { n += 1 + l + sovPb(uint64(l)) } l = len(m.UserMeta) if l > 0 { n += 1 + l + sovPb(uint64(l)) } if m.Version != 0 { n += 1 + sovPb(uint64(m.Version)) } if m.ExpiresAt != 0 { n += 1 + sovPb(uint64(m.ExpiresAt)) } l = len(m.Meta) if l > 0 { n += 1 + l + sovPb(uint64(l)) } if m.StreamId != 0 { n += 1 + sovPb(uint64(m.StreamId)) } if m.StreamDone { n += 2 } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func (m *KVList) Size() (n int) { if m == nil { return 0 } var l int _ = l if len(m.Kv) > 0 { for _, e := range m.Kv { l = e.Size() n += 1 + l + sovPb(uint64(l)) } } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func (m *ManifestChangeSet) Size() (n int) { if m == nil { return 0 } var l int _ = l if len(m.Changes) > 0 { for _, e := range m.Changes { l = e.Size() n += 1 + l + sovPb(uint64(l)) } } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func (m *ManifestChange) Size() (n int) { if m == nil { return 0 } var l int _ = l if m.Id != 0 { n += 1 + sovPb(uint64(m.Id)) } if m.Op != 0 { n += 1 + sovPb(uint64(m.Op)) } if m.Level != 0 { n += 1 + sovPb(uint64(m.Level)) } if m.KeyId != 0 { n += 1 + sovPb(uint64(m.KeyId)) } if m.EncryptionAlgo != 0 { n += 1 + sovPb(uint64(m.EncryptionAlgo)) } if m.Compression != 0 { n += 1 + sovPb(uint64(m.Compression)) } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func (m *BlockOffset) Size() (n int) { if m == nil { return 0 } var l int _ = l l = len(m.Key) if l > 0 { n += 1 + l + sovPb(uint64(l)) } if m.Offset != 0 { n += 1 + 
sovPb(uint64(m.Offset)) } if m.Len != 0 { n += 1 + sovPb(uint64(m.Len)) } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func (m *TableIndex) Size() (n int) { if m == nil { return 0 } var l int _ = l if len(m.Offsets) > 0 { for _, e := range m.Offsets { l = e.Size() n += 1 + l + sovPb(uint64(l)) } } l = len(m.BloomFilter) if l > 0 { n += 1 + l + sovPb(uint64(l)) } if m.EstimatedSize != 0 { n += 1 + sovPb(uint64(m.EstimatedSize)) } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func (m *Checksum) Size() (n int) { if m == nil { return 0 } var l int _ = l if m.Algo != 0 { n += 1 + sovPb(uint64(m.Algo)) } if m.Sum != 0 { n += 1 + sovPb(uint64(m.Sum)) } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func (m *DataKey) Size() (n int) { if m == nil { return 0 } var l int _ = l if m.KeyId != 0 { n += 1 + sovPb(uint64(m.KeyId)) } l = len(m.Data) if l > 0 { n += 1 + l + sovPb(uint64(l)) } l = len(m.Iv) if l > 0 { n += 1 + l + sovPb(uint64(l)) } if m.CreatedAt != 0 { n += 1 + sovPb(uint64(m.CreatedAt)) } if m.XXX_unrecognized != nil { n += len(m.XXX_unrecognized) } return n } func sovPb(x uint64) (n int) { return (math_bits.Len64(x|1) + 6) / 7 } func sozPb(x uint64) (n int) { return sovPb(uint64((x << 1) ^ uint64((int64(x) >> 63)))) } func (m *KV) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: KV: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: KV: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...) if m.Key == nil { m.Key = []byte{} } iNdEx = postIndex case 2: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Value = append(m.Value[:0], dAtA[iNdEx:postIndex]...) 
if m.Value == nil { m.Value = []byte{} } iNdEx = postIndex case 3: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field UserMeta", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.UserMeta = append(m.UserMeta[:0], dAtA[iNdEx:postIndex]...) if m.UserMeta == nil { m.UserMeta = []byte{} } iNdEx = postIndex case 4: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Version", wireType) } m.Version = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Version |= uint64(b&0x7F) << shift if b < 0x80 { break } } case 5: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field ExpiresAt", wireType) } m.ExpiresAt = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.ExpiresAt |= uint64(b&0x7F) << shift if b < 0x80 { break } } case 6: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Meta", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Meta = append(m.Meta[:0], dAtA[iNdEx:postIndex]...) if m.Meta == nil { m.Meta = []byte{} } iNdEx = postIndex case 10: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field StreamId", wireType) } m.StreamId = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.StreamId |= uint32(b&0x7F) << shift if b < 0x80 { break } } case 11: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field StreamDone", wireType) } var v int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ v |= int(b&0x7F) << shift if b < 0x80 { break } } m.StreamDone = bool(v != 0) default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func (m *KVList) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: KVList: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: KVList: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Kv", wireType) } var msglen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ msglen |= int(b&0x7F) << shift if b < 0x80 { break } } if msglen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + msglen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Kv = append(m.Kv, &KV{}) if err := m.Kv[len(m.Kv)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { return err } iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func (m *ManifestChangeSet) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: ManifestChangeSet: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: ManifestChangeSet: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Changes", wireType) } var msglen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ msglen |= int(b&0x7F) << shift if b < 0x80 { break } } if msglen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + msglen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Changes = append(m.Changes, &ManifestChange{}) if err := m.Changes[len(m.Changes)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { return err } iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func (m *ManifestChange) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: ManifestChange: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: ManifestChange: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Id", wireType) } m.Id = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Id |= uint64(b&0x7F) << shift if b < 0x80 { break } } case 2: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Op", wireType) } m.Op = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Op |= ManifestChange_Operation(b&0x7F) << shift if b < 0x80 { break } } case 3: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Level", wireType) } m.Level = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Level |= uint32(b&0x7F) << shift if b < 0x80 { break } } case 4: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field KeyId", wireType) } m.KeyId = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.KeyId |= uint64(b&0x7F) << shift if b < 0x80 { break } } case 5: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field EncryptionAlgo", wireType) } m.EncryptionAlgo = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.EncryptionAlgo |= EncryptionAlgo(b&0x7F) << shift if b < 0x80 { break } } case 6: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Compression", wireType) } m.Compression = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Compression |= uint32(b&0x7F) << shift if b < 0x80 { break } } default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func (m *BlockOffset) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: BlockOffset: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: BlockOffset: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...) if m.Key == nil { m.Key = []byte{} } iNdEx = postIndex case 2: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Offset", wireType) } m.Offset = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Offset |= uint32(b&0x7F) << shift if b < 0x80 { break } } case 3: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Len", wireType) } m.Len = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Len |= uint32(b&0x7F) << shift if b < 0x80 { break } } default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func (m *TableIndex) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: TableIndex: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: TableIndex: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Offsets", wireType) } var msglen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ msglen |= int(b&0x7F) << shift if b < 0x80 { break } } if msglen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + msglen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Offsets = append(m.Offsets, &BlockOffset{}) if err := m.Offsets[len(m.Offsets)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { return err } iNdEx = postIndex case 2: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field BloomFilter", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.BloomFilter = append(m.BloomFilter[:0], dAtA[iNdEx:postIndex]...) if m.BloomFilter == nil { m.BloomFilter = []byte{} } iNdEx = postIndex case 3: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field EstimatedSize", wireType) } m.EstimatedSize = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.EstimatedSize |= uint64(b&0x7F) << shift if b < 0x80 { break } } default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func (m *Checksum) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: Checksum: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: Checksum: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Algo", wireType) } m.Algo = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Algo |= Checksum_Algorithm(b&0x7F) << shift if b < 0x80 { break } } case 2: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field Sum", wireType) } m.Sum = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.Sum |= uint64(b&0x7F) << shift if b < 0x80 { break } } default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func (m *DataKey) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: DataKey: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: DataKey: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field KeyId", wireType) } m.KeyId = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.KeyId |= uint64(b&0x7F) << shift if b < 0x80 { break } } case 2: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Data", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Data = append(m.Data[:0], dAtA[iNdEx:postIndex]...) 
if m.Data == nil { m.Data = []byte{} } iNdEx = postIndex case 3: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field Iv", wireType) } var byteLen int for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ byteLen |= int(b&0x7F) << shift if b < 0x80 { break } } if byteLen < 0 { return ErrInvalidLengthPb } postIndex := iNdEx + byteLen if postIndex < 0 { return ErrInvalidLengthPb } if postIndex > l { return io.ErrUnexpectedEOF } m.Iv = append(m.Iv[:0], dAtA[iNdEx:postIndex]...) if m.Iv == nil { m.Iv = []byte{} } iNdEx = postIndex case 4: if wireType != 0 { return fmt.Errorf("proto: wrong wireType = %d for field CreatedAt", wireType) } m.CreatedAt = 0 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowPb } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ m.CreatedAt |= int64(b&0x7F) << shift if b < 0x80 { break } } default: iNdEx = preIndex skippy, err := skipPb(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) < 0 { return ErrInvalidLengthPb } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func skipPb(dAtA []byte) (n int, err error) { l := len(dAtA) iNdEx := 0 depth := 0 for iNdEx < l { var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowPb } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= (uint64(b) & 0x7F) << shift if b < 0x80 { break } } wireType := int(wire & 0x7) switch wireType { case 0: for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowPb } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } iNdEx++ if dAtA[iNdEx-1] < 0x80 { break } } case 1: iNdEx += 8 case 2: var length int for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowPb } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ length |= (int(b) & 0x7F) << shift if b < 0x80 { break } } if length < 0 { return 0, ErrInvalidLengthPb } iNdEx += length case 3: depth++ case 4: if depth == 0 { return 0, ErrUnexpectedEndOfGroupPb } depth-- case 5: iNdEx += 4 default: return 0, fmt.Errorf("proto: illegal wireType %d", wireType) } if iNdEx < 0 { return 0, ErrInvalidLengthPb } if depth == 0 { return iNdEx, nil } } return 0, io.ErrUnexpectedEOF } var ( ErrInvalidLengthPb = fmt.Errorf("proto: negative length found during unmarshaling") ErrIntOverflowPb = fmt.Errorf("proto: integer overflow") ErrUnexpectedEndOfGroupPb = fmt.Errorf("proto: unexpected end of group") ) badger-2.2007.2/pb/pb.proto000066400000000000000000000040451372173116500152700ustar00rootroot00000000000000/* * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // Use protos/gen.sh to generate .pb.go files. 
syntax = "proto3"; package badgerpb2; option go_package = "github.com/dgraph-io/badger/v2/pb"; message KV { bytes key = 1; bytes value = 2; bytes user_meta = 3; uint64 version = 4; uint64 expires_at = 5; bytes meta = 6; // Stream id is used to identify which stream the KV came from. uint32 stream_id = 10; // Stream done is used to indicate end of stream. bool stream_done = 11; } message KVList { repeated KV kv = 1; } message ManifestChangeSet { // A set of changes that are applied atomically. repeated ManifestChange changes = 1; } enum EncryptionAlgo { aes = 0; } message ManifestChange { uint64 Id = 1; // Table ID. enum Operation { CREATE = 0; DELETE = 1; } Operation Op = 2; uint32 Level = 3; // Only used for CREATE. uint64 key_id = 4; EncryptionAlgo encryption_algo = 5; uint32 compression = 6; // Only used for CREATE Op. } message BlockOffset { bytes key = 1; uint32 offset = 2; uint32 len = 3; } message TableIndex { repeated BlockOffset offsets = 1; bytes bloom_filter = 2; uint64 estimated_size = 3; } message Checksum { enum Algorithm { CRC32C = 0; XXHash64 = 1; } Algorithm algo = 1; // For storing type of Checksum algorithm used uint64 sum = 2; } message DataKey { uint64 key_id = 1; bytes data = 2; bytes iv = 3; int64 created_at = 4; } badger-2.2007.2/publisher.go000066400000000000000000000067621372173116500155350ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "sync" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/trie" "github.com/dgraph-io/badger/v2/y" ) type subscriber struct { prefixes [][]byte sendCh chan<- *pb.KVList subCloser *y.Closer } type publisher struct { sync.Mutex pubCh chan requests subscribers map[uint64]subscriber nextID uint64 indexer *trie.Trie } func newPublisher() *publisher { return &publisher{ pubCh: make(chan requests, 1000), subscribers: make(map[uint64]subscriber), nextID: 0, indexer: trie.NewTrie(), } } func (p *publisher) listenForUpdates(c *y.Closer) { defer func() { p.cleanSubscribers() c.Done() }() slurp := func(batch requests) { for { select { case reqs := <-p.pubCh: batch = append(batch, reqs...) default: p.publishUpdates(batch) return } } } for { select { case <-c.HasBeenClosed(): return case reqs := <-p.pubCh: slurp(reqs) } } } func (p *publisher) publishUpdates(reqs requests) { p.Lock() defer func() { p.Unlock() // Release all the request. 
reqs.DecrRef() }() batchedUpdates := make(map[uint64]*pb.KVList) for _, req := range reqs { for _, e := range req.Entries { ids := p.indexer.Get(e.Key) if len(ids) > 0 { k := y.SafeCopy(nil, e.Key) kv := &pb.KV{ Key: y.ParseKey(k), Value: y.SafeCopy(nil, e.Value), Meta: []byte{e.UserMeta}, ExpiresAt: e.ExpiresAt, Version: y.ParseTs(k), } for id := range ids { if _, ok := batchedUpdates[id]; !ok { batchedUpdates[id] = &pb.KVList{} } batchedUpdates[id].Kv = append(batchedUpdates[id].Kv, kv) } } } } for id, kvs := range batchedUpdates { p.subscribers[id].sendCh <- kvs } } func (p *publisher) newSubscriber(c *y.Closer, prefixes ...[]byte) (<-chan *pb.KVList, uint64) { p.Lock() defer p.Unlock() ch := make(chan *pb.KVList, 1000) id := p.nextID // Increment next ID. p.nextID++ p.subscribers[id] = subscriber{ prefixes: prefixes, sendCh: ch, subCloser: c, } for _, prefix := range prefixes { p.indexer.Add(prefix, id) } return ch, id } // cleanSubscribers stops all the subscribers. Ideally, It should be called while closing DB. func (p *publisher) cleanSubscribers() { p.Lock() defer p.Unlock() for id, s := range p.subscribers { for _, prefix := range s.prefixes { p.indexer.Delete(prefix, id) } delete(p.subscribers, id) s.subCloser.SignalAndWait() } } func (p *publisher) deleteSubscriber(id uint64) { p.Lock() defer p.Unlock() if s, ok := p.subscribers[id]; ok { for _, prefix := range s.prefixes { p.indexer.Delete(prefix, id) } } delete(p.subscribers, id) } func (p *publisher) sendUpdates(reqs requests) { if p.noOfSubscribers() != 0 { reqs.IncrRef() p.pubCh <- reqs } } func (p *publisher) noOfSubscribers() int { p.Lock() defer p.Unlock() return len(p.subscribers) } badger-2.2007.2/publisher_test.go000066400000000000000000000050761372173116500165710ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package badger import ( "context" "fmt" "sync" "testing" "github.com/stretchr/testify/require" "github.com/dgraph-io/badger/v2/pb" ) func TestPublisherOrdering(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { order := []string{} var wg sync.WaitGroup wg.Add(1) var subWg sync.WaitGroup subWg.Add(1) go func() { subWg.Done() updates := 0 err := db.Subscribe(context.Background(), func(kvs *pb.KVList) error { updates += len(kvs.GetKv()) for _, kv := range kvs.GetKv() { order = append(order, string(kv.Value)) } if updates == 5 { wg.Done() } return nil }, []byte("ke")) if err != nil { require.Equal(t, err.Error(), context.Canceled.Error()) } }() subWg.Wait() for i := 0; i < 5; i++ { db.Update(func(txn *Txn) error { e := NewEntry([]byte(fmt.Sprintf("key%d", i)), []byte(fmt.Sprintf("value%d", i))) return txn.SetEntry(e) }) } wg.Wait() for i := 0; i < 5; i++ { require.Equal(t, fmt.Sprintf("value%d", i), order[i]) } }) } func TestMultiplePrefix(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { var wg sync.WaitGroup wg.Add(1) var subWg sync.WaitGroup subWg.Add(1) go func() { subWg.Done() updates := 0 err := db.Subscribe(context.Background(), func(kvs *pb.KVList) error { updates += len(kvs.GetKv()) for _, kv := range kvs.GetKv() { if string(kv.Key) == "key" { require.Equal(t, string(kv.Value), "value") } else { require.Equal(t, string(kv.Value), "badger") } } if updates == 2 { wg.Done() } return nil }, []byte("ke"), []byte("hel")) if err != nil { require.Equal(t, err.Error(), context.Canceled.Error()) } }() subWg.Wait() db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("key"), []byte("value"))) }) db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte("hello"), []byte("badger"))) }) wg.Wait() }) } badger-2.2007.2/skl/000077500000000000000000000000001372173116500137675ustar00rootroot00000000000000badger-2.2007.2/skl/README.md000066400000000000000000000113021372173116500152430ustar00rootroot00000000000000This is much better than `skiplist` and `slist`. ``` BenchmarkReadWrite/frac_0-8 3000000 537 ns/op BenchmarkReadWrite/frac_1-8 3000000 503 ns/op BenchmarkReadWrite/frac_2-8 3000000 492 ns/op BenchmarkReadWrite/frac_3-8 3000000 475 ns/op BenchmarkReadWrite/frac_4-8 3000000 440 ns/op BenchmarkReadWrite/frac_5-8 5000000 442 ns/op BenchmarkReadWrite/frac_6-8 5000000 380 ns/op BenchmarkReadWrite/frac_7-8 5000000 338 ns/op BenchmarkReadWrite/frac_8-8 5000000 294 ns/op BenchmarkReadWrite/frac_9-8 10000000 268 ns/op BenchmarkReadWrite/frac_10-8 100000000 26.3 ns/op ``` And even better than a simple map with read-write lock: ``` BenchmarkReadWriteMap/frac_0-8 2000000 774 ns/op BenchmarkReadWriteMap/frac_1-8 2000000 647 ns/op BenchmarkReadWriteMap/frac_2-8 3000000 605 ns/op BenchmarkReadWriteMap/frac_3-8 3000000 603 ns/op BenchmarkReadWriteMap/frac_4-8 3000000 556 ns/op BenchmarkReadWriteMap/frac_5-8 3000000 472 ns/op BenchmarkReadWriteMap/frac_6-8 3000000 476 ns/op BenchmarkReadWriteMap/frac_7-8 3000000 457 ns/op BenchmarkReadWriteMap/frac_8-8 5000000 444 ns/op BenchmarkReadWriteMap/frac_9-8 5000000 361 ns/op BenchmarkReadWriteMap/frac_10-8 10000000 212 ns/op ``` # Node Pooling Command used ``` rm -Rf tmp && /usr/bin/time -l ./populate -keys_mil 10 ``` For pprof results, we run without using /usr/bin/time. There are four runs below. Results seem to vary quite a bit between runs. 
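For reference, the skiplist-versus-map numbers at the top of this file come from `BenchmarkReadWrite` and `BenchmarkReadWriteMap` in `skl_test.go`; assuming a standard Go toolchain, something like the following re-runs them from the repository root:

```
go test -bench 'BenchmarkReadWrite' -benchmem ./skl/
```

(`-bench` takes a regex, so this matches both benchmarks; `-benchmem` additionally reports allocations per operation.)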
## Before node pooling ``` 1311.53MB of 1338.69MB total (97.97%) Dropped 30 nodes (cum <= 6.69MB) Showing top 10 nodes out of 37 (cum >= 12.50MB) flat flat% sum% cum cum% 523.04MB 39.07% 39.07% 523.04MB 39.07% github.com/dgraph-io/badger/skl.(*Skiplist).Put 184.51MB 13.78% 52.85% 184.51MB 13.78% runtime.stringtoslicebyte 166.01MB 12.40% 65.25% 689.04MB 51.47% github.com/dgraph-io/badger/mem.(*Table).Put 165MB 12.33% 77.58% 165MB 12.33% runtime.convT2E 116.92MB 8.73% 86.31% 116.92MB 8.73% bytes.makeSlice 62.50MB 4.67% 90.98% 62.50MB 4.67% main.newValue 34.50MB 2.58% 93.56% 34.50MB 2.58% github.com/dgraph-io/badger/table.(*BlockIterator).parseKV 25.50MB 1.90% 95.46% 100.06MB 7.47% github.com/dgraph-io/badger/y.(*MergeIterator).Next 21.06MB 1.57% 97.04% 21.06MB 1.57% github.com/dgraph-io/badger/table.(*Table).read 12.50MB 0.93% 97.97% 12.50MB 0.93% github.com/dgraph-io/badger/table.header.Encode 128.31 real 329.37 user 17.11 sys 3355660288 maximum resident set size 0 average shared memory size 0 average unshared data size 0 average unshared stack size 2203080 page reclaims 764 page faults 0 swaps 275 block input operations 76 block output operations 0 messages sent 0 messages received 0 signals received 49173 voluntary context switches 599922 involuntary context switches ``` ## After node pooling ``` 1963.13MB of 2026.09MB total (96.89%) Dropped 29 nodes (cum <= 10.13MB) Showing top 10 nodes out of 41 (cum >= 185.62MB) flat flat% sum% cum cum% 658.05MB 32.48% 32.48% 658.05MB 32.48% github.com/dgraph-io/badger/skl.glob..func1 297.51MB 14.68% 47.16% 297.51MB 14.68% runtime.convT2E 257.51MB 12.71% 59.87% 257.51MB 12.71% runtime.stringtoslicebyte 249.01MB 12.29% 72.16% 1007.06MB 49.70% github.com/dgraph-io/badger/mem.(*Table).Put 142.43MB 7.03% 79.19% 142.43MB 7.03% bytes.makeSlice 100MB 4.94% 84.13% 758.05MB 37.41% github.com/dgraph-io/badger/skl.newNode 99.50MB 4.91% 89.04% 99.50MB 4.91% main.newValue 75MB 3.70% 92.74% 75MB 3.70% github.com/dgraph-io/badger/table.(*BlockIterator).parseKV 44.62MB 2.20% 94.94% 44.62MB 2.20% github.com/dgraph-io/badger/table.(*Table).read 39.50MB 1.95% 96.89% 185.62MB 9.16% github.com/dgraph-io/badger/y.(*MergeIterator).Next 135.58 real 374.29 user 17.65 sys 3740614656 maximum resident set size 0 average shared memory size 0 average unshared data size 0 average unshared stack size 2276566 page reclaims 770 page faults 0 swaps 128 block input operations 90 block output operations 0 messages sent 0 messages received 0 signals received 46434 voluntary context switches 597049 involuntary context switches ``` badger-2.2007.2/skl/arena.go000066400000000000000000000076101372173116500154100ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package skl import ( "sync/atomic" "unsafe" "github.com/dgraph-io/badger/v2/y" ) const ( offsetSize = int(unsafe.Sizeof(uint32(0))) // Always align nodes on 64-bit boundaries, even on 32-bit architectures, // so that the node.value field is 64-bit aligned. 
This is necessary because // node.getValueOffset uses atomic.LoadUint64, which expects its input // pointer to be 64-bit aligned. nodeAlign = int(unsafe.Sizeof(uint64(0))) - 1 ) // Arena should be lock-free. type Arena struct { n uint32 buf []byte } // newArena returns a new arena. func newArena(n int64) *Arena { // Don't store data at position 0 in order to reserve offset=0 as a kind // of nil pointer. out := &Arena{ n: 1, buf: make([]byte, n), } return out } func (s *Arena) size() int64 { return int64(atomic.LoadUint32(&s.n)) } func (s *Arena) reset() { atomic.StoreUint32(&s.n, 0) } // putNode allocates a node in the arena. The node is aligned on a pointer-sized // boundary. The arena offset of the node is returned. func (s *Arena) putNode(height int) uint32 { // Compute the amount of the tower that will never be used, since the height // is less than maxHeight. unusedSize := (maxHeight - height) * offsetSize // Pad the allocation with enough bytes to ensure pointer alignment. l := uint32(MaxNodeSize - unusedSize + nodeAlign) n := atomic.AddUint32(&s.n, l) y.AssertTruef(int(n) <= len(s.buf), "Arena too small, toWrite:%d newTotal:%d limit:%d", l, n, len(s.buf)) // Return the aligned offset. m := (n - l + uint32(nodeAlign)) & ^uint32(nodeAlign) return m } // Put will *copy* val into arena. To make better use of this, reuse your input // val buffer. Returns an offset into buf. User is responsible for remembering // size of val. We could also store this size inside arena but the encoding and // decoding will incur some overhead. func (s *Arena) putVal(v y.ValueStruct) uint32 { l := uint32(v.EncodedSize()) n := atomic.AddUint32(&s.n, l) y.AssertTruef(int(n) <= len(s.buf), "Arena too small, toWrite:%d newTotal:%d limit:%d", l, n, len(s.buf)) m := n - l v.Encode(s.buf[m:]) return m } func (s *Arena) putKey(key []byte) uint32 { l := uint32(len(key)) n := atomic.AddUint32(&s.n, l) y.AssertTruef(int(n) <= len(s.buf), "Arena too small, toWrite:%d newTotal:%d limit:%d", l, n, len(s.buf)) m := n - l y.AssertTrue(len(key) == copy(s.buf[m:n], key)) return m } // getNode returns a pointer to the node located at offset. If the offset is // zero, then the nil node pointer is returned. func (s *Arena) getNode(offset uint32) *node { if offset == 0 { return nil } return (*node)(unsafe.Pointer(&s.buf[offset])) } // getKey returns byte slice at offset. func (s *Arena) getKey(offset uint32, size uint16) []byte { return s.buf[offset : offset+uint32(size)] } // getVal returns byte slice at offset. The given size should be just the value // size and should NOT include the meta bytes. func (s *Arena) getVal(offset uint32, size uint32) (ret y.ValueStruct) { ret.Decode(s.buf[offset : offset+size]) return } // getNodeOffset returns the offset of node in the arena. If the node pointer is // nil, then the zero offset is returned. func (s *Arena) getNodeOffset(nd *node) uint32 { if nd == nil { return 0 } return uint32(uintptr(unsafe.Pointer(nd)) - uintptr(unsafe.Pointer(&s.buf[0]))) } badger-2.2007.2/skl/skl.go000066400000000000000000000350771372173116500151230ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* Adapted from RocksDB inline skiplist. Key differences: - No optimization for sequential inserts (no "prev"). - No custom comparator. - Support overwrites. This requires care when we see the same key when inserting. For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so there is no need for values. We don't intend to support versioning. In-place updates of values would be more efficient. - We discard all non-concurrent code. - We do not support Splices. This simplifies the code a lot. - No AllocateNode or other pointer arithmetic. - We combine the findLessThan, findGreaterOrEqual, etc into one function. */ package skl import ( "math" "sync/atomic" "unsafe" "github.com/dgraph-io/badger/v2/y" "github.com/dgraph-io/ristretto/z" ) const ( maxHeight = 20 heightIncrease = math.MaxUint32 / 3 ) // MaxNodeSize is the memory footprint of a node of maximum height. const MaxNodeSize = int(unsafe.Sizeof(node{})) type node struct { // Multiple parts of the value are encoded as a single uint64 so that it // can be atomically loaded and stored: // value offset: uint32 (bits 0-31) // value size : uint16 (bits 32-63) value uint64 // A byte slice is 24 bytes. We are trying to save space here. keyOffset uint32 // Immutable. No need to lock to access key. keySize uint16 // Immutable. No need to lock to access key. // Height of the tower. height uint16 // Most nodes do not need to use the full height of the tower, since the // probability of each successive level decreases exponentially. Because // these elements are never accessed, they do not need to be allocated. // Therefore, when a node is allocated in the arena, its memory footprint // is deliberately truncated to not include unneeded tower elements. // // All accesses to elements should use CAS operations, with no need to lock. tower [maxHeight]uint32 } // Skiplist maps keys to values (in memory) type Skiplist struct { height int32 // Current height. 1 <= height <= kMaxHeight. CAS. head *node ref int32 arena *Arena } // IncrRef increases the refcount func (s *Skiplist) IncrRef() { atomic.AddInt32(&s.ref, 1) } // DecrRef decrements the refcount, deallocating the Skiplist when done using it func (s *Skiplist) DecrRef() { newRef := atomic.AddInt32(&s.ref, -1) if newRef > 0 { return } s.arena.reset() // Indicate we are closed. Good for testing. Also, lets GC reclaim memory. Race condition // here would suggest we are accessing skiplist when we are supposed to have no reference! s.arena = nil // Since the head references the arena's buf, as long as the head is kept around // GC can't release the buf. s.head = nil } func newNode(arena *Arena, key []byte, v y.ValueStruct, height int) *node { // The base level is already allocated in the node struct. 
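// Everything below lives in the arena: the node struct, the key bytes, and
// the encoded value. The node itself only records arena offsets. The value
// offset and size are packed into a single uint64 (see encodeValue), which
// is what lets setValue/getValueOffset swap a node's value with one atomic
// Store/Load.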
offset := arena.putNode(height) node := arena.getNode(offset) node.keyOffset = arena.putKey(key) node.keySize = uint16(len(key)) node.height = uint16(height) node.value = encodeValue(arena.putVal(v), v.EncodedSize()) return node } func encodeValue(valOffset uint32, valSize uint32) uint64 { return uint64(valSize)<<32 | uint64(valOffset) } func decodeValue(value uint64) (valOffset uint32, valSize uint32) { valOffset = uint32(value) valSize = uint32(value >> 32) return } // NewSkiplist makes a new empty skiplist, with a given arena size func NewSkiplist(arenaSize int64) *Skiplist { arena := newArena(arenaSize) head := newNode(arena, nil, y.ValueStruct{}, maxHeight) return &Skiplist{ height: 1, head: head, arena: arena, ref: 1, } } func (s *node) getValueOffset() (uint32, uint32) { value := atomic.LoadUint64(&s.value) return decodeValue(value) } func (s *node) key(arena *Arena) []byte { return arena.getKey(s.keyOffset, s.keySize) } func (s *node) setValue(arena *Arena, v y.ValueStruct) { valOffset := arena.putVal(v) value := encodeValue(valOffset, v.EncodedSize()) atomic.StoreUint64(&s.value, value) } func (s *node) getNextOffset(h int) uint32 { return atomic.LoadUint32(&s.tower[h]) } func (s *node) casNextOffset(h int, old, val uint32) bool { return atomic.CompareAndSwapUint32(&s.tower[h], old, val) } // Returns true if key is strictly > n.key. // If n is nil, this is an "end" marker and we return false. //func (s *Skiplist) keyIsAfterNode(key []byte, n *node) bool { // y.AssertTrue(n != s.head) // return n != nil && y.CompareKeys(key, n.key) > 0 //} func (s *Skiplist) randomHeight() int { h := 1 for h < maxHeight && z.FastRand() <= heightIncrease { h++ } return h } func (s *Skiplist) getNext(nd *node, height int) *node { return s.arena.getNode(nd.getNextOffset(height)) } // findNear finds the node near to key. // If less=true, it finds rightmost node such that node.key < key (if allowEqual=false) or // node.key <= key (if allowEqual=true). // If less=false, it finds leftmost node such that node.key > key (if allowEqual=false) or // node.key >= key (if allowEqual=true). // Returns the node found. The bool returned is true if the node has key equal to given key. func (s *Skiplist) findNear(key []byte, less bool, allowEqual bool) (*node, bool) { x := s.head level := int(s.getHeight() - 1) for { // Assume x.key < key. next := s.getNext(x, level) if next == nil { // x.key < key < END OF LIST if level > 0 { // Can descend further to iterate closer to the end. level-- continue } // Level=0. Cannot descend further. Let's return something that makes sense. if !less { return nil, false } // Try to return x. Make sure it is not a head node. if x == s.head { return nil, false } return x, false } nextKey := next.key(s.arena) cmp := y.CompareKeys(key, nextKey) if cmp > 0 { // x.key < next.key < key. We can continue to move right. x = next continue } if cmp == 0 { // x.key < key == next.key. if allowEqual { return next, true } if !less { // We want >, so go to base level to grab the next bigger note. return s.getNext(next, 0), false } // We want <. If not base level, we should go closer in the next level. if level > 0 { level-- continue } // On base level. Return x. if x == s.head { return nil, false } return x, false } // cmp < 0. In other words, x.key < key < next. if level > 0 { level-- continue } // At base level. Need to return something. if !less { return next, false } // Try to return x. Make sure it is not a head node. 
if x == s.head { return nil, false } return x, false } } // findSpliceForLevel returns (outBefore, outAfter) with outBefore.key <= key <= outAfter.key. // The input "before" tells us where to start looking. // If we found a node with the same key, then we return outBefore = outAfter. // Otherwise, outBefore.key < key < outAfter.key. func (s *Skiplist) findSpliceForLevel(key []byte, before *node, level int) (*node, *node) { for { // Assume before.key < key. next := s.getNext(before, level) if next == nil { return before, next } nextKey := next.key(s.arena) cmp := y.CompareKeys(key, nextKey) if cmp == 0 { // Equality case. return next, next } if cmp < 0 { // before.key < key < next.key. We are done for this level. return before, next } before = next // Keep moving right on this level. } } func (s *Skiplist) getHeight() int32 { return atomic.LoadInt32(&s.height) } // Put inserts the key-value pair. func (s *Skiplist) Put(key []byte, v y.ValueStruct) { // Since we allow overwrite, we may not need to create a new node. We might not even need to // increase the height. Let's defer these actions. listHeight := s.getHeight() var prev [maxHeight + 1]*node var next [maxHeight + 1]*node prev[listHeight] = s.head next[listHeight] = nil for i := int(listHeight) - 1; i >= 0; i-- { // Use higher level to speed up for current level. prev[i], next[i] = s.findSpliceForLevel(key, prev[i+1], i) if prev[i] == next[i] { prev[i].setValue(s.arena, v) return } } // We do need to create a new node. height := s.randomHeight() x := newNode(s.arena, key, v, height) // Try to increase s.height via CAS. listHeight = s.getHeight() for height > int(listHeight) { if atomic.CompareAndSwapInt32(&s.height, listHeight, int32(height)) { // Successfully increased skiplist.height. break } listHeight = s.getHeight() } // We always insert from the base level and up. After you add a node in base level, we cannot // create a node in the level above because it would have discovered the node in the base level. for i := 0; i < height; i++ { for { if prev[i] == nil { y.AssertTrue(i > 1) // This cannot happen in base level. // We haven't computed prev, next for this level because height exceeds old listHeight. // For these levels, we expect the lists to be sparse, so we can just search from head. prev[i], next[i] = s.findSpliceForLevel(key, s.head, i) // Someone adds the exact same key before we are able to do so. This can only happen on // the base level. But we know we are not on the base level. y.AssertTrue(prev[i] != next[i]) } nextOffset := s.arena.getNodeOffset(next[i]) x.tower[i] = nextOffset if prev[i].casNextOffset(i, nextOffset, s.arena.getNodeOffset(x)) { // Managed to insert x between prev[i] and next[i]. Go to the next level. break } // CAS failed. We need to recompute prev and next. // It is unlikely to be helpful to try to use a different level as we redo the search, // because it is unlikely that lots of nodes are inserted between prev[i] and next[i]. prev[i], next[i] = s.findSpliceForLevel(key, prev[i], i) if prev[i] == next[i] { y.AssertTruef(i == 0, "Equality can happen only on base level: %d", i) prev[i].setValue(s.arena, v) return } } } } // Empty returns if the Skiplist is empty. func (s *Skiplist) Empty() bool { return s.findLast() == nil } // findLast returns the last element. If head (empty list), we return nil. All the find functions // will NEVER return the head nodes. 
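// findLast backs Empty() above and the iterator's SeekToLast() below.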
func (s *Skiplist) findLast() *node { n := s.head level := int(s.getHeight()) - 1 for { next := s.getNext(n, level) if next != nil { n = next continue } if level == 0 { if n == s.head { return nil } return n } level-- } } // Get gets the value associated with the key. It returns a valid value if it finds equal or earlier // version of the same key. func (s *Skiplist) Get(key []byte) y.ValueStruct { n, _ := s.findNear(key, false, true) // findGreaterOrEqual. if n == nil { return y.ValueStruct{} } nextKey := s.arena.getKey(n.keyOffset, n.keySize) if !y.SameKey(key, nextKey) { return y.ValueStruct{} } valOffset, valSize := n.getValueOffset() vs := s.arena.getVal(valOffset, valSize) vs.Version = y.ParseTs(nextKey) return vs } // NewIterator returns a skiplist iterator. You have to Close() the iterator. func (s *Skiplist) NewIterator() *Iterator { s.IncrRef() return &Iterator{list: s} } // MemSize returns the size of the Skiplist in terms of how much memory is used within its internal // arena. func (s *Skiplist) MemSize() int64 { return s.arena.size() } // Iterator is an iterator over skiplist object. For new objects, you just // need to initialize Iterator.list. type Iterator struct { list *Skiplist n *node } // Close frees the resources held by the iterator func (s *Iterator) Close() error { s.list.DecrRef() return nil } // Valid returns true iff the iterator is positioned at a valid node. func (s *Iterator) Valid() bool { return s.n != nil } // Key returns the key at the current position. func (s *Iterator) Key() []byte { return s.list.arena.getKey(s.n.keyOffset, s.n.keySize) } // Value returns value. func (s *Iterator) Value() y.ValueStruct { valOffset, valSize := s.n.getValueOffset() return s.list.arena.getVal(valOffset, valSize) } // Next advances to the next position. func (s *Iterator) Next() { y.AssertTrue(s.Valid()) s.n = s.list.getNext(s.n, 0) } // Prev advances to the previous position. func (s *Iterator) Prev() { y.AssertTrue(s.Valid()) s.n, _ = s.list.findNear(s.Key(), true, false) // find <. No equality allowed. } // Seek advances to the first entry with a key >= target. func (s *Iterator) Seek(target []byte) { s.n, _ = s.list.findNear(target, false, true) // find >=. } // SeekForPrev finds an entry with key <= target. func (s *Iterator) SeekForPrev(target []byte) { s.n, _ = s.list.findNear(target, true, true) // find <=. } // SeekToFirst seeks position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. func (s *Iterator) SeekToFirst() { s.n = s.list.getNext(s.list.head, 0) } // SeekToLast seeks position at the last entry in list. // Final state of iterator is Valid() iff list is not empty. func (s *Iterator) SeekToLast() { s.n = s.list.findLast() } // UniIterator is a unidirectional memtable iterator. It is a thin wrapper around // Iterator. We like to keep Iterator as before, because it is more powerful and // we might support bidirectional iterators in the future. type UniIterator struct { iter *Iterator reversed bool } // NewUniIterator returns a UniIterator. 
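// With reversed=true, Next walks backwards and Rewind seeks to the last
// entry, so a single interface serves both iteration directions.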
func (s *Skiplist) NewUniIterator(reversed bool) *UniIterator { return &UniIterator{ iter: s.NewIterator(), reversed: reversed, } } // Next implements y.Interface func (s *UniIterator) Next() { if !s.reversed { s.iter.Next() } else { s.iter.Prev() } } // Rewind implements y.Interface func (s *UniIterator) Rewind() { if !s.reversed { s.iter.SeekToFirst() } else { s.iter.SeekToLast() } } // Seek implements y.Interface func (s *UniIterator) Seek(key []byte) { if !s.reversed { s.iter.Seek(key) } else { s.iter.SeekForPrev(key) } } // Key implements y.Interface func (s *UniIterator) Key() []byte { return s.iter.Key() } // Value implements y.Interface func (s *UniIterator) Value() y.ValueStruct { return s.iter.Value() } // Valid implements y.Interface func (s *UniIterator) Valid() bool { return s.iter.Valid() } // Close implements y.Interface (and frees up the iter's resources) func (s *UniIterator) Close() error { return s.iter.Close() } badger-2.2007.2/skl/skl_test.go000066400000000000000000000357751372173116500161670ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package skl import ( "encoding/binary" "fmt" "math/rand" "strconv" "sync" "sync/atomic" "testing" "time" "github.com/stretchr/testify/require" "github.com/dgraph-io/badger/v2/y" ) const arenaSize = 1 << 20 func (s *Skiplist) valid() bool { return s.arena != nil } func newValue(v int) []byte { return []byte(fmt.Sprintf("%05d", v)) } // length iterates over skiplist to give exact size. func length(s *Skiplist) int { x := s.getNext(s.head, 0) count := 0 for x != nil { count++ x = s.getNext(x, 0) } return count } func TestEmpty(t *testing.T) { key := []byte("aaa") l := NewSkiplist(arenaSize) v := l.Get(key) require.True(t, v.Value == nil) // Cannot use require.Nil for unsafe.Pointer nil. for _, less := range []bool{true, false} { for _, allowEqual := range []bool{true, false} { n, found := l.findNear(key, less, allowEqual) require.Nil(t, n) require.False(t, found) } } it := l.NewIterator() require.False(t, it.Valid()) it.SeekToFirst() require.False(t, it.Valid()) it.SeekToLast() require.False(t, it.Valid()) it.Seek(key) require.False(t, it.Valid()) l.DecrRef() require.True(t, l.valid()) // Check the reference counting. it.Close() require.False(t, l.valid()) // Check the reference counting. } // TestBasic tests single-threaded inserts and updates and gets. func TestBasic(t *testing.T) { l := NewSkiplist(arenaSize) val1 := newValue(42) val2 := newValue(52) val3 := newValue(62) val4 := newValue(72) val5 := []byte(fmt.Sprintf("%0102400d", 1)) // Have size 100 KB which is > math.MaxUint16. // Try inserting values. // Somehow require.Nil doesn't work when checking for unsafe.Pointer(nil). 
l.Put(y.KeyWithTs([]byte("key1"), 0), y.ValueStruct{Value: val1, Meta: 55, UserMeta: 0}) l.Put(y.KeyWithTs([]byte("key2"), 2), y.ValueStruct{Value: val2, Meta: 56, UserMeta: 0}) l.Put(y.KeyWithTs([]byte("key3"), 0), y.ValueStruct{Value: val3, Meta: 57, UserMeta: 0}) v := l.Get(y.KeyWithTs([]byte("key"), 0)) require.True(t, v.Value == nil) v = l.Get(y.KeyWithTs([]byte("key1"), 0)) require.True(t, v.Value != nil) require.EqualValues(t, "00042", string(v.Value)) require.EqualValues(t, 55, v.Meta) v = l.Get(y.KeyWithTs([]byte("key2"), 0)) require.True(t, v.Value == nil) v = l.Get(y.KeyWithTs([]byte("key3"), 0)) require.True(t, v.Value != nil) require.EqualValues(t, "00062", string(v.Value)) require.EqualValues(t, 57, v.Meta) l.Put(y.KeyWithTs([]byte("key3"), 1), y.ValueStruct{Value: val4, Meta: 12, UserMeta: 0}) v = l.Get(y.KeyWithTs([]byte("key3"), 1)) require.True(t, v.Value != nil) require.EqualValues(t, "00072", string(v.Value)) require.EqualValues(t, 12, v.Meta) l.Put(y.KeyWithTs([]byte("key4"), 1), y.ValueStruct{Value: val5, Meta: 60, UserMeta: 0}) v = l.Get(y.KeyWithTs([]byte("key4"), 1)) require.NotNil(t, v.Value) require.EqualValues(t, val5, v.Value) require.EqualValues(t, 60, v.Meta) } // TestConcurrentBasic tests concurrent writes followed by concurrent reads. func TestConcurrentBasic(t *testing.T) { const n = 1000 l := NewSkiplist(arenaSize) var wg sync.WaitGroup key := func(i int) []byte { return y.KeyWithTs([]byte(fmt.Sprintf("%05d", i)), 0) } for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() l.Put(key(i), y.ValueStruct{Value: newValue(i), Meta: 0, UserMeta: 0}) }(i) } wg.Wait() // Check values. Concurrent reads. for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() v := l.Get(key(i)) require.True(t, v.Value != nil) require.EqualValues(t, newValue(i), v.Value) }(i) } wg.Wait() require.EqualValues(t, n, length(l)) } func TestConcurrentBasicBigValues(t *testing.T) { const n = 100 arenaSize := int64(120 << 20) // 120 MB l := NewSkiplist(arenaSize) var wg sync.WaitGroup key := func(i int) []byte { return y.KeyWithTs([]byte(fmt.Sprintf("%05d", i)), 0) } BigValue := func(i int) []byte { return []byte(fmt.Sprintf("%01048576d", i)) // Have 1 MB value which is > math.MaxUint16. } for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() l.Put(key(i), y.ValueStruct{Value: BigValue(i), Meta: 0, UserMeta: 0}) }(i) } wg.Wait() // Check values. Concurrent reads. for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() v := l.Get(key(i)) require.NotNil(t, v.Value) require.EqualValues(t, BigValue(i), v.Value) }(i) } wg.Wait() require.EqualValues(t, n, length(l)) } // TestOneKey will read while writing to one single key. func TestOneKey(t *testing.T) { const n = 100 key := y.KeyWithTs([]byte("thekey"), 0) l := NewSkiplist(arenaSize) defer l.DecrRef() var wg sync.WaitGroup for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() l.Put(key, y.ValueStruct{Value: newValue(i), Meta: 0, UserMeta: 0}) }(i) } // We expect that at least some write made it such that some read returns a value. 
var sawValue int32 for i := 0; i < n; i++ { wg.Add(1) go func() { defer wg.Done() p := l.Get(key) if p.Value == nil { return } atomic.AddInt32(&sawValue, 1) v, err := strconv.Atoi(string(p.Value)) require.NoError(t, err) require.True(t, 0 <= v && v < n, fmt.Sprintf("invalid value %d", v)) }() } wg.Wait() require.True(t, sawValue > 0) require.EqualValues(t, 1, length(l)) } func TestFindNear(t *testing.T) { l := NewSkiplist(arenaSize) defer l.DecrRef() for i := 0; i < 1000; i++ { key := fmt.Sprintf("%05d", i*10+5) l.Put(y.KeyWithTs([]byte(key), 0), y.ValueStruct{Value: newValue(i), Meta: 0, UserMeta: 0}) } n, eq := l.findNear(y.KeyWithTs([]byte("00001"), 0), false, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("00005"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("00001"), 0), false, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("00005"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("00001"), 0), true, false) require.Nil(t, n) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("00001"), 0), true, true) require.Nil(t, n) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("00005"), 0), false, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("00015"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("00005"), 0), false, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("00005"), 0), string(n.key(l.arena))) require.True(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("00005"), 0), true, false) require.Nil(t, n) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("00005"), 0), true, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("00005"), 0), string(n.key(l.arena))) require.True(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05555"), 0), false, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05565"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05555"), 0), false, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05555"), 0), string(n.key(l.arena))) require.True(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05555"), 0), true, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05545"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05555"), 0), true, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05555"), 0), string(n.key(l.arena))) require.True(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05558"), 0), false, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05565"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05558"), 0), false, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05565"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05558"), 0), true, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05555"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("05558"), 0), true, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("05555"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("09995"), 0), false, false) require.Nil(t, n) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("09995"), 0), false, true) require.NotNil(t, n) 
require.EqualValues(t, y.KeyWithTs([]byte("09995"), 0), string(n.key(l.arena))) require.True(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("09995"), 0), true, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("09985"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("09995"), 0), true, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("09995"), 0), string(n.key(l.arena))) require.True(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("59995"), 0), false, false) require.Nil(t, n) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("59995"), 0), false, true) require.Nil(t, n) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("59995"), 0), true, false) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("09995"), 0), string(n.key(l.arena))) require.False(t, eq) n, eq = l.findNear(y.KeyWithTs([]byte("59995"), 0), true, true) require.NotNil(t, n) require.EqualValues(t, y.KeyWithTs([]byte("09995"), 0), string(n.key(l.arena))) require.False(t, eq) } // TestIteratorNext tests a basic iteration over all nodes from the beginning. func TestIteratorNext(t *testing.T) { const n = 100 l := NewSkiplist(arenaSize) defer l.DecrRef() it := l.NewIterator() defer it.Close() require.False(t, it.Valid()) it.SeekToFirst() require.False(t, it.Valid()) for i := n - 1; i >= 0; i-- { l.Put(y.KeyWithTs([]byte(fmt.Sprintf("%05d", i)), 0), y.ValueStruct{Value: newValue(i), Meta: 0, UserMeta: 0}) } it.SeekToFirst() for i := 0; i < n; i++ { require.True(t, it.Valid()) v := it.Value() require.EqualValues(t, newValue(i), v.Value) it.Next() } require.False(t, it.Valid()) } // TestIteratorPrev tests a basic iteration over all nodes from the end. func TestIteratorPrev(t *testing.T) { const n = 100 l := NewSkiplist(arenaSize) defer l.DecrRef() it := l.NewIterator() defer it.Close() require.False(t, it.Valid()) it.SeekToFirst() require.False(t, it.Valid()) for i := 0; i < n; i++ { l.Put(y.KeyWithTs([]byte(fmt.Sprintf("%05d", i)), 0), y.ValueStruct{Value: newValue(i), Meta: 0, UserMeta: 0}) } it.SeekToLast() for i := n - 1; i >= 0; i-- { require.True(t, it.Valid()) v := it.Value() require.EqualValues(t, newValue(i), v.Value) it.Prev() } require.False(t, it.Valid()) } // TestIteratorSeek tests Seek and SeekForPrev. func TestIteratorSeek(t *testing.T) { const n = 100 l := NewSkiplist(arenaSize) defer l.DecrRef() it := l.NewIterator() defer it.Close() require.False(t, it.Valid()) it.SeekToFirst() require.False(t, it.Valid()) // 1000, 1010, 1020, ..., 1990. for i := n - 1; i >= 0; i-- { v := i*10 + 1000 l.Put(y.KeyWithTs([]byte(fmt.Sprintf("%05d", i*10+1000)), 0), y.ValueStruct{Value: newValue(v), Meta: 0, UserMeta: 0}) } it.SeekToFirst() require.True(t, it.Valid()) v := it.Value() require.EqualValues(t, "01000", v.Value) it.Seek(y.KeyWithTs([]byte("01000"), 0)) require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, "01000", v.Value) it.Seek(y.KeyWithTs([]byte("01005"), 0)) require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, "01010", v.Value) it.Seek(y.KeyWithTs([]byte("01010"), 0)) require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, "01010", v.Value) it.Seek(y.KeyWithTs([]byte("99999"), 0)) require.False(t, it.Valid()) // Try SeekForPrev. 
it.SeekForPrev(y.KeyWithTs([]byte("00"), 0)) require.False(t, it.Valid()) it.SeekForPrev(y.KeyWithTs([]byte("01000"), 0)) require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, "01000", v.Value) it.SeekForPrev(y.KeyWithTs([]byte("01005"), 0)) require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, "01000", v.Value) it.SeekForPrev(y.KeyWithTs([]byte("01010"), 0)) require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, "01010", v.Value) it.SeekForPrev(y.KeyWithTs([]byte("99999"), 0)) require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, "01990", v.Value) } func randomKey(rng *rand.Rand) []byte { b := make([]byte, 8) key := rng.Uint32() key2 := rng.Uint32() binary.LittleEndian.PutUint32(b, key) binary.LittleEndian.PutUint32(b[4:], key2) return y.KeyWithTs(b, 0) } // Standard test. Some fraction is read. Some fraction is write. Writes have // to go through mutex lock. func BenchmarkReadWrite(b *testing.B) { value := newValue(123) for i := 0; i <= 10; i++ { readFrac := float32(i) / 10.0 b.Run(fmt.Sprintf("frac_%d", i), func(b *testing.B) { l := NewSkiplist(int64((b.N + 1) * MaxNodeSize)) defer l.DecrRef() b.ResetTimer() var count int b.RunParallel(func(pb *testing.PB) { rng := rand.New(rand.NewSource(time.Now().UnixNano())) for pb.Next() { if rng.Float32() < readFrac { v := l.Get(randomKey(rng)) if v.Value != nil { count++ } } else { l.Put(randomKey(rng), y.ValueStruct{Value: value, Meta: 0, UserMeta: 0}) } } }) }) } } // Standard test. Some fraction is read. Some fraction is write. Writes have // to go through mutex lock. func BenchmarkReadWriteMap(b *testing.B) { value := newValue(123) for i := 0; i <= 10; i++ { readFrac := float32(i) / 10.0 b.Run(fmt.Sprintf("frac_%d", i), func(b *testing.B) { m := make(map[string][]byte) var mutex sync.RWMutex b.ResetTimer() var count int b.RunParallel(func(pb *testing.PB) { rng := rand.New(rand.NewSource(time.Now().UnixNano())) for pb.Next() { if rng.Float32() < readFrac { mutex.RLock() _, ok := m[string(randomKey(rng))] mutex.RUnlock() if ok { count++ } } else { mutex.Lock() m[string(randomKey(rng))] = value mutex.Unlock() } } }) }) } } func BenchmarkWrite(b *testing.B) { value := newValue(123) l := NewSkiplist(int64((b.N + 1) * MaxNodeSize)) defer l.DecrRef() b.ResetTimer() b.RunParallel(func(pb *testing.PB) { rng := rand.New(rand.NewSource(time.Now().UnixNano())) for pb.Next() { l.Put(randomKey(rng), y.ValueStruct{Value: value, Meta: 0, UserMeta: 0}) } }) } badger-2.2007.2/stream.go000066400000000000000000000270701372173116500150260ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "context" "math" "sync" "sync/atomic" "time" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" humanize "github.com/dustin/go-humanize" "github.com/golang/protobuf/proto" ) const pageSize = 4 << 20 // 4MB // maxStreamSize is the maximum allowed size of a stream batch. 
This is a soft limit // as a single list that is still over the limit will have to be sent as is since it // cannot be split further. This limit prevents the framework from creating batches // so big that sending them causes issues (e.g running into the max size gRPC limit). var maxStreamSize = uint64(100 << 20) // 100MB // Stream provides a framework to concurrently iterate over a snapshot of Badger, pick up // key-values, batch them up and call Send. Stream does concurrent iteration over many smaller key // ranges. It does NOT send keys in lexicographical sorted order. To get keys in sorted // order, use Iterator. type Stream struct { // Prefix to only iterate over certain range of keys. If set to nil (default), Stream would // iterate over the entire DB. Prefix []byte // Number of goroutines to use for iterating over key ranges. Defaults to 16. NumGo int // Badger would produce log entries in Infof to indicate the progress of Stream. LogPrefix can // be used to help differentiate them from other activities. Default is "Badger.Stream". LogPrefix string // ChooseKey is invoked each time a new key is encountered. Note that this is not called // on every version of the value, only the first encountered version (i.e. the highest version // of the value a key has). ChooseKey can be left nil to select all keys. // // Note: Calls to ChooseKey are concurrent. ChooseKey func(item *Item) bool // KeyToList, similar to ChooseKey, is only invoked on the highest version of the value. It // is upto the caller to iterate over the versions and generate zero, one or more KVs. It // is expected that the user would advance the iterator to go through the versions of the // values. However, the user MUST immediately return from this function on the first encounter // with a mismatching key. See example usage in ToList function. Can be left nil to use ToList // function by default. // // Note: Calls to KeyToList are concurrent. KeyToList func(key []byte, itr *Iterator) (*pb.KVList, error) // This is the method where Stream sends the final output. All calls to Send are done by a // single goroutine, i.e. logic within Send method can expect single threaded execution. Send func(*pb.KVList) error readTs uint64 db *DB rangeCh chan keyRange kvChan chan *pb.KVList nextStreamId uint32 } // ToList is a default implementation of KeyToList. It picks up all valid versions of the key, // skipping over deleted or expired keys. func (st *Stream) ToList(key []byte, itr *Iterator) (*pb.KVList, error) { list := &pb.KVList{} for ; itr.Valid(); itr.Next() { item := itr.Item() if item.IsDeletedOrExpired() { break } if !bytes.Equal(key, item.Key()) { // Break out on the first encounter with another key. break } valCopy, err := item.ValueCopy(nil) if err != nil { return nil, err } kv := &pb.KV{ Key: item.KeyCopy(nil), Value: valCopy, UserMeta: []byte{item.UserMeta()}, Version: item.Version(), ExpiresAt: item.ExpiresAt(), } list.Kv = append(list.Kv, kv) if st.db.opt.NumVersionsToKeep == 1 { break } if item.DiscardEarlierVersions() { break } } return list, nil } // keyRange is [start, end), including start, excluding end. Do ensure that the start, // end byte slices are owned by keyRange struct. func (st *Stream) produceRanges(ctx context.Context) { splits := st.db.KeySplits(st.Prefix) // We don't need to create more key ranges than NumGo goroutines. This way, we will have limited // number of "streams" coming out, which then helps limit the memory used by SSWriter. 
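// For example, with 100 splits and the default NumGo of 16, pickEvery is
// floor(100/16) = 6, so only every 6th split is kept, leaving ~16 key ranges.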
{ pickEvery := int(math.Floor(float64(len(splits)) / float64(st.NumGo))) if pickEvery < 1 { pickEvery = 1 } filtered := splits[:0] for i, split := range splits { if (i+1)%pickEvery == 0 { filtered = append(filtered, split) } } splits = filtered } start := y.SafeCopy(nil, st.Prefix) for _, key := range splits { st.rangeCh <- keyRange{left: start, right: y.SafeCopy(nil, []byte(key))} start = y.SafeCopy(nil, []byte(key)) } // Edge case: prefix is empty and no splits exist. In that case, we should have at least one // keyRange output. st.rangeCh <- keyRange{left: start} close(st.rangeCh) } // produceKVs picks up ranges from rangeCh, generates KV lists and sends them to kvChan. func (st *Stream) produceKVs(ctx context.Context, threadId int) error { var size int var txn *Txn if st.readTs > 0 { txn = st.db.NewTransactionAt(st.readTs, false) } else { txn = st.db.NewTransaction(false) } defer txn.Discard() iterate := func(kr keyRange) error { iterOpts := DefaultIteratorOptions iterOpts.AllVersions = true iterOpts.Prefix = st.Prefix iterOpts.PrefetchValues = false itr := txn.NewIterator(iterOpts) itr.ThreadId = threadId defer itr.Close() // This unique stream id is used to identify all the keys from this iteration. streamId := atomic.AddUint32(&st.nextStreamId, 1) outList := new(pb.KVList) sendIt := func() error { select { case st.kvChan <- outList: case <-ctx.Done(): return ctx.Err() } outList = new(pb.KVList) size = 0 return nil } var prevKey []byte for itr.Seek(kr.left); itr.Valid(); { // it.Valid would only return true for keys with the provided Prefix in iterOpts. item := itr.Item() if bytes.Equal(item.Key(), prevKey) { itr.Next() continue } prevKey = append(prevKey[:0], item.Key()...) // Check if we reached the end of the key range. if len(kr.right) > 0 && bytes.Compare(item.Key(), kr.right) >= 0 { break } // Check if we should pick this key. if st.ChooseKey != nil && !st.ChooseKey(item) { continue } // Now convert to key value. list, err := st.KeyToList(item.KeyCopy(nil), itr) if err != nil { return err } if list == nil || len(list.Kv) == 0 { continue } for _, kv := range list.Kv { size += proto.Size(kv) kv.StreamId = streamId outList.Kv = append(outList.Kv, kv) if size < pageSize { continue } if err := sendIt(); err != nil { return err } } } if len(outList.Kv) > 0 { // TODO: Think of a way to indicate that a stream is over. if err := sendIt(); err != nil { return err } } return nil } for { select { case kr, ok := <-st.rangeCh: if !ok { // Done with the keys. return nil } if err := iterate(kr); err != nil { return err } case <-ctx.Done(): return ctx.Err() } } } func (st *Stream) streamKVs(ctx context.Context) error { var count int var bytesSent uint64 t := time.NewTicker(time.Second) defer t.Stop() now := time.Now() sendBatch := func(batch *pb.KVList) error { sz := uint64(proto.Size(batch)) bytesSent += sz count += len(batch.Kv) t := time.Now() if err := st.Send(batch); err != nil { return err } st.db.opt.Infof("%s Created batch of size: %s in %s.\n", st.LogPrefix, humanize.Bytes(sz), time.Since(t)) return nil } slurp := func(batch *pb.KVList) error { loop: for { // Send the batch immediately if it already exceeds the maximum allowed size. // If the size of the batch exceeds maxStreamSize, break from the loop to // avoid creating a batch that is so big that certain limits are reached. sz := uint64(proto.Size(batch)) if sz > maxStreamSize { break loop } select { case kvs, ok := <-st.kvChan: if !ok { break loop } y.AssertTrue(kvs != nil) batch.Kv = append(batch.Kv, kvs.Kv...) 
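// No more lists are immediately available; stop slurping so the
// accumulated batch gets sent via sendBatch below.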
default: break loop } } return sendBatch(batch) } outer: for { var batch *pb.KVList select { case <-ctx.Done(): return ctx.Err() case <-t.C: dur := time.Since(now) durSec := uint64(dur.Seconds()) if durSec == 0 { continue } speed := bytesSent / durSec st.db.opt.Infof("%s Time elapsed: %s, bytes sent: %s, speed: %s/sec\n", st.LogPrefix, y.FixedDuration(dur), humanize.Bytes(bytesSent), humanize.Bytes(speed)) case kvs, ok := <-st.kvChan: if !ok { break outer } y.AssertTrue(kvs != nil) batch = kvs // Otherwise, slurp more keys into this batch. if err := slurp(batch); err != nil { return err } } } st.db.opt.Infof("%s Sent %d keys\n", st.LogPrefix, count) return nil } // Orchestrate runs Stream. It picks up ranges from the SSTables, then runs NumGo number of // goroutines to iterate over these ranges and batch up KVs in lists. It concurrently runs a single // goroutine to pick these lists, batch them up further and send to Output.Send. Orchestrate also // spits logs out to Infof, using provided LogPrefix. Note that all calls to Output.Send // are serial. In case any of these steps encounter an error, Orchestrate would stop execution and // return that error. Orchestrate can be called multiple times, but in serial order. func (st *Stream) Orchestrate(ctx context.Context) error { st.rangeCh = make(chan keyRange, 3) // Contains keys for posting lists. // kvChan should only have a small capacity to ensure that we don't buffer up too much data if // sending is slow. Page size is set to 4MB, which is used to lazily cap the size of each // KVList. To get 128MB buffer, we can set the channel size to 32. st.kvChan = make(chan *pb.KVList, 32) if st.KeyToList == nil { st.KeyToList = st.ToList } // Picks up ranges from Badger, and sends them to rangeCh. go st.produceRanges(ctx) errCh := make(chan error, 1) // Stores error by consumeKeys. var wg sync.WaitGroup for i := 0; i < st.NumGo; i++ { wg.Add(1) go func(threadId int) { defer wg.Done() // Picks up ranges from rangeCh, generates KV lists, and sends them to kvChan. if err := st.produceKVs(ctx, threadId); err != nil { select { case errCh <- err: default: } } }(i) } // Pick up key-values from kvChan and send to stream. kvErr := make(chan error, 1) go func() { // Picks up KV lists from kvChan, and sends them to Output. kvErr <- st.streamKVs(ctx) }() wg.Wait() // Wait for produceKVs to be over. close(st.kvChan) // Now we can close kvChan. select { case err := <-errCh: // Check error from produceKVs. return err default: } // Wait for key streaming to be over. err := <-kvErr return err } func (db *DB) newStream() *Stream { return &Stream{db: db, NumGo: 16, LogPrefix: "Badger.Stream"} } // NewStream creates a new Stream. func (db *DB) NewStream() *Stream { if db.opt.managedTxns { panic("This API can not be called in managed mode.") } return db.newStream() } // NewStreamAt creates a new Stream at a particular timestamp. Should only be used with managed DB. func (db *DB) NewStreamAt(readTs uint64) *Stream { if !db.opt.managedTxns { panic("This API can only be called in managed mode.") } stream := db.newStream() stream.readTs = readTs return stream } badger-2.2007.2/stream_test.go000066400000000000000000000150461372173116500160650ustar00rootroot00000000000000/* * Copyright 2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "context" "fmt" "io/ioutil" "math" "strconv" "strings" "testing" bpb "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) func keyWithPrefix(prefix string, k int) []byte { return []byte(fmt.Sprintf("%s-%d", prefix, k)) } func keyToInt(k []byte) (string, int) { splits := strings.Split(string(k), "-") key, err := strconv.Atoi(splits[1]) y.Check(err) return splits[0], key } func value(k int) []byte { return []byte(fmt.Sprintf("%08d", k)) } type collector struct { kv []*bpb.KV } func (c *collector) Send(list *bpb.KVList) error { c.kv = append(c.kv, list.Kv...) return nil } var ctxb = context.Background() func TestStream(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := OpenManaged(DefaultOptions(dir)) require.NoError(t, err) var count int for _, prefix := range []string{"p0", "p1", "p2"} { txn := db.NewTransactionAt(math.MaxUint64, true) for i := 1; i <= 100; i++ { require.NoError(t, txn.SetEntry(NewEntry(keyWithPrefix(prefix, i), value(i)))) count++ } require.NoError(t, txn.CommitAt(5, nil)) } stream := db.NewStreamAt(math.MaxUint64) stream.LogPrefix = "Testing" c := &collector{} stream.Send = c.Send // Test case 1. Retrieve everything. err = stream.Orchestrate(ctxb) require.NoError(t, err) require.Equal(t, 300, len(c.kv), "Expected 300. Got: %d", len(c.kv)) m := make(map[string]int) for _, kv := range c.kv { prefix, ki := keyToInt(kv.Key) expected := value(ki) require.Equal(t, expected, kv.Value) m[prefix]++ } require.Equal(t, 3, len(m)) for pred, count := range m { require.Equal(t, 100, count, "Count mismatch for pred: %s", pred) } // Test case 2. Retrieve only 1 predicate. stream.Prefix = []byte("p1") c.kv = c.kv[:0] err = stream.Orchestrate(ctxb) require.NoError(t, err) require.Equal(t, 100, len(c.kv), "Expected 100. Got: %d", len(c.kv)) m = make(map[string]int) for _, kv := range c.kv { prefix, ki := keyToInt(kv.Key) expected := value(ki) require.Equal(t, expected, kv.Value) m[prefix]++ } require.Equal(t, 1, len(m)) for pred, count := range m { require.Equal(t, 100, count, "Count mismatch for pred: %s", pred) } // Test case 3. Retrieve select keys within the predicate. c.kv = c.kv[:0] stream.ChooseKey = func(item *Item) bool { _, k := keyToInt(item.Key()) return k%2 == 0 } err = stream.Orchestrate(ctxb) require.NoError(t, err) require.Equal(t, 50, len(c.kv), "Expected 50. Got: %d", len(c.kv)) m = make(map[string]int) for _, kv := range c.kv { prefix, ki := keyToInt(kv.Key) expected := value(ki) require.Equal(t, expected, kv.Value) m[prefix]++ } require.Equal(t, 1, len(m)) for pred, count := range m { require.Equal(t, 50, count, "Count mismatch for pred: %s", pred) } // Test case 4. Retrieve select keys from all predicates. c.kv = c.kv[:0] stream.Prefix = []byte{} err = stream.Orchestrate(ctxb) require.NoError(t, err) require.Equal(t, 150, len(c.kv), "Expected 150. 
Got: %d", len(c.kv)) m = make(map[string]int) for _, kv := range c.kv { prefix, ki := keyToInt(kv.Key) expected := value(ki) require.Equal(t, expected, kv.Value) m[prefix]++ } require.Equal(t, 3, len(m)) for pred, count := range m { require.Equal(t, 50, count, "Count mismatch for pred: %s", pred) } require.NoError(t, db.Close()) } func TestStreamWithThreadId(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := OpenManaged(DefaultOptions(dir)) require.NoError(t, err) var count int for _, prefix := range []string{"p0", "p1", "p2"} { txn := db.NewTransactionAt(math.MaxUint64, true) for i := 1; i <= 100; i++ { require.NoError(t, txn.SetEntry(NewEntry(keyWithPrefix(prefix, i), value(i)))) count++ } require.NoError(t, txn.CommitAt(5, nil)) } stream := db.NewStreamAt(math.MaxUint64) stream.LogPrefix = "Testing" stream.KeyToList = func(key []byte, itr *Iterator) ( *bpb.KVList, error) { require.Less(t, itr.ThreadId, stream.NumGo) return stream.ToList(key, itr) } c := &collector{} stream.Send = c.Send err = stream.Orchestrate(ctxb) require.NoError(t, err) require.Equal(t, 300, len(c.kv), "Expected 300. Got: %d", len(c.kv)) m := make(map[string]int) for _, kv := range c.kv { prefix, ki := keyToInt(kv.Key) expected := value(ki) require.Equal(t, expected, kv.Value) m[prefix]++ } require.Equal(t, 3, len(m)) for pred, count := range m { require.Equal(t, 100, count, "Count mismatch for pred: %s", pred) } require.NoError(t, db.Close()) } func TestBigStream(t *testing.T) { // Set the maxStreamSize to 1MB for the duration of the test so that the it can use a smaller // dataset than it would otherwise need. originalMaxStreamSize := maxStreamSize maxStreamSize = 1 << 20 defer func() { maxStreamSize = originalMaxStreamSize }() testSize := int(1e6) dir, err := ioutil.TempDir("", "badger-big-test") require.NoError(t, err) defer removeDir(dir) db, err := OpenManaged(DefaultOptions(dir)) require.NoError(t, err) var count int wb := db.NewWriteBatchAt(5) for _, prefix := range []string{"p0", "p1", "p2"} { for i := 1; i <= testSize; i++ { require.NoError(t, wb.SetEntry(NewEntry(keyWithPrefix(prefix, i), value(i)))) count++ } } require.NoError(t, wb.Flush()) stream := db.NewStreamAt(math.MaxUint64) stream.LogPrefix = "Testing" c := &collector{} stream.Send = c.Send // Test case 1. Retrieve everything. err = stream.Orchestrate(ctxb) require.NoError(t, err) require.Equal(t, 3*testSize, len(c.kv), "Expected 30000. Got: %d", len(c.kv)) m := make(map[string]int) for _, kv := range c.kv { prefix, ki := keyToInt(kv.Key) expected := value(ki) require.Equal(t, expected, kv.Value) m[prefix]++ } require.Equal(t, 3, len(m)) for pred, count := range m { require.Equal(t, testSize, count, "Count mismatch for pred: %s", pred) } require.NoError(t, db.Close()) } badger-2.2007.2/stream_writer.go000066400000000000000000000323011372173116500164130ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package badger import ( "encoding/hex" "fmt" "math" "sync" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" humanize "github.com/dustin/go-humanize" "github.com/pkg/errors" ) const headStreamId uint32 = math.MaxUint32 // StreamWriter is used to write data coming from multiple streams. The streams must not have any // overlapping key ranges. Within each stream, the keys must be sorted. Badger Stream framework is // capable of generating such an output. So, this StreamWriter can be used at the other end to build // BadgerDB at a much faster pace by writing SSTables (and value logs) directly to LSM tree levels // without causing any compactions at all. This is way faster than using batched writer or using // transactions, but only applicable in situations where the keys are pre-sorted and the DB is being // bootstrapped. Existing data would get deleted when using this writer. So, this is only useful // when restoring from backup or replicating DB across servers. // // StreamWriter should not be called on in-use DB instances. It is designed only to bootstrap new // DBs. type StreamWriter struct { writeLock sync.Mutex db *DB done func() throttle *y.Throttle maxVersion uint64 writers map[uint32]*sortedWriter maxHead valuePointer } // NewStreamWriter creates a StreamWriter. Right after creating StreamWriter, Prepare must be // called. The memory usage of a StreamWriter is directly proportional to the number of streams // possible. So, efforts must be made to keep the number of streams low. Stream framework would // typically use 16 goroutines and hence create 16 streams. func (db *DB) NewStreamWriter() *StreamWriter { return &StreamWriter{ db: db, // throttle shouldn't make much difference. Memory consumption is based on the number of // concurrent streams being processed. throttle: y.NewThrottle(16), writers: make(map[uint32]*sortedWriter), } } // Prepare should be called before writing any entry to StreamWriter. It deletes all data present in // existing DB, stops compactions and any writes being done by other means. Be very careful when // calling Prepare, because it could result in permanent data loss. Not calling Prepare would result // in a corrupt Badger instance. func (sw *StreamWriter) Prepare() error { sw.writeLock.Lock() defer sw.writeLock.Unlock() var err error sw.done, err = sw.db.dropAll() return err } // Write writes KVList to DB. Each KV within the list contains the stream id which StreamWriter // would use to demux the writes. Write is thread safe and can be called concurrently by multiple // goroutines. func (sw *StreamWriter) Write(kvs *pb.KVList) error { if len(kvs.GetKv()) == 0 { return nil } // closedStreams keeps track of all streams which are going to be marked as done. We are // keeping track of all streams so that we can close them at the end, after inserting all // the valid kvs. closedStreams := make(map[uint32]struct{}) streamReqs := make(map[uint32]*request) for _, kv := range kvs.Kv { if kv.StreamDone { closedStreams[kv.StreamId] = struct{}{} continue } // Panic if some kv comes after stream has been marked as closed. 
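// Note: closedStreams only tracks streams marked done within this Write call.
// Writes to a stream closed in an earlier call are caught further below, when
// the writer looked up for that stream turns out to be nil.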
if _, ok := closedStreams[kv.StreamId]; ok { panic(fmt.Sprintf("write performed on closed stream: %d", kv.StreamId)) } var meta, userMeta byte if len(kv.Meta) > 0 { meta = kv.Meta[0] } if len(kv.UserMeta) > 0 { userMeta = kv.UserMeta[0] } if sw.maxVersion < kv.Version { sw.maxVersion = kv.Version } e := &Entry{ Key: y.KeyWithTs(kv.Key, kv.Version), Value: kv.Value, UserMeta: userMeta, ExpiresAt: kv.ExpiresAt, meta: meta, } // If the value can be collocated with the key in LSM tree, we can skip // writing the value to value log. e.skipVlog = sw.db.shouldWriteValueToLSM(*e) req := streamReqs[kv.StreamId] if req == nil { req = &request{} streamReqs[kv.StreamId] = req } req.Entries = append(req.Entries, e) } all := make([]*request, 0, len(streamReqs)) for _, req := range streamReqs { all = append(all, req) } sw.writeLock.Lock() defer sw.writeLock.Unlock() // We are writing all requests to vlog even if some request belongs to already closed stream. // It is safe to do because we are panicking while writing to sorted writer, which will be nil // for closed stream. At restart, stream writer will drop all the data in Prepare function. if err := sw.db.vlog.write(all); err != nil { return err } for streamID, req := range streamReqs { writer, ok := sw.writers[streamID] if !ok { var err error writer, err = sw.newWriter(streamID) if err != nil { return errors.Wrapf(err, "failed to create writer with ID %d", streamID) } sw.writers[streamID] = writer } if writer == nil { panic(fmt.Sprintf("write performed on closed stream: %d", streamID)) } writer.reqCh <- req } // Now we can close any streams if required. We will make writer for // the closed streams as nil. for streamId := range closedStreams { writer, ok := sw.writers[streamId] if !ok { sw.db.opt.Logger.Warningf("Trying to close stream: %d, but no sorted "+ "writer found for it", streamId) continue } writer.closer.SignalAndWait() if err := writer.Done(); err != nil { return err } if sw.maxHead.Less(writer.head) { sw.maxHead = writer.head } sw.writers[streamId] = nil } return nil } // Flush is called once we are done writing all the entries. It syncs DB directories. It also // updates Oracle with maxVersion found in all entries (if DB is not managed). func (sw *StreamWriter) Flush() error { sw.writeLock.Lock() defer sw.writeLock.Unlock() defer sw.done() for _, writer := range sw.writers { if writer != nil { writer.closer.SignalAndWait() } } for _, writer := range sw.writers { if writer == nil { continue } if err := writer.Done(); err != nil { return err } if sw.maxHead.Less(writer.head) { sw.maxHead = writer.head } } // Encode and write the value log head into a new table. data := sw.maxHead.Encode() headWriter, err := sw.newWriter(headStreamId) if err != nil { return errors.Wrap(err, "failed to create head writer") } if err := headWriter.Add( y.KeyWithTs(head, sw.maxVersion), y.ValueStruct{Value: data}); err != nil { return err } headWriter.closer.SignalAndWait() if err := headWriter.Done(); err != nil { return err } if !sw.db.opt.managedTxns { if sw.db.orc != nil { sw.db.orc.Stop() } sw.db.orc = newOracle(sw.db.opt) sw.db.orc.nextTxnTs = sw.maxVersion sw.db.orc.txnMark.Done(sw.maxVersion) sw.db.orc.readMark.Done(sw.maxVersion) sw.db.orc.incrementNextTs() } // Wait for all files to be written. if err := sw.throttle.Finish(); err != nil { return err } // Sort tables at the end. for _, l := range sw.db.lc.levels { l.sortTables() } // Now sync the directories, so all the files are registered. 
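// ValueDir may point to the same directory as Dir, in which case it is
// synced only once.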
if sw.db.opt.ValueDir != sw.db.opt.Dir { if err := sw.db.syncDir(sw.db.opt.ValueDir); err != nil { return err } } if err := sw.db.syncDir(sw.db.opt.Dir); err != nil { return err } return sw.db.lc.validate() } type sortedWriter struct { db *DB throttle *y.Throttle builder *table.Builder lastKey []byte streamID uint32 reqCh chan *request head valuePointer // Have separate closer for each writer, as it can be closed at any time. closer *y.Closer } func (sw *StreamWriter) newWriter(streamID uint32) (*sortedWriter, error) { dk, err := sw.db.registry.latestDataKey() if err != nil { return nil, err } bopts := buildTableOptions(sw.db.opt) bopts.DataKey = dk w := &sortedWriter{ db: sw.db, streamID: streamID, throttle: sw.throttle, builder: table.NewTableBuilder(bopts), reqCh: make(chan *request, 3), closer: y.NewCloser(1), } go w.handleRequests() return w, nil } func (w *sortedWriter) handleRequests() { defer w.closer.Done() process := func(req *request) { for i, e := range req.Entries { // If badger is running in InMemory mode, len(req.Ptrs) == 0. if i < len(req.Ptrs) { vptr := req.Ptrs[i] if !vptr.IsZero() { y.AssertTrue(w.head.Less(vptr)) w.head = vptr } } var vs y.ValueStruct if e.skipVlog { vs = y.ValueStruct{ Value: e.Value, Meta: e.meta, UserMeta: e.UserMeta, ExpiresAt: e.ExpiresAt, } } else { vptr := req.Ptrs[i] vs = y.ValueStruct{ Value: vptr.Encode(), Meta: e.meta | bitValuePointer, UserMeta: e.UserMeta, ExpiresAt: e.ExpiresAt, } } if err := w.Add(e.Key, vs); err != nil { panic(err) } } } for { select { case req := <-w.reqCh: process(req) case <-w.closer.HasBeenClosed(): close(w.reqCh) for req := range w.reqCh { process(req) } return } } } // Add adds key and vs to sortedWriter. func (w *sortedWriter) Add(key []byte, vs y.ValueStruct) error { if len(w.lastKey) > 0 && y.CompareKeys(key, w.lastKey) <= 0 { return errors.Errorf("keys not in sorted order (last key: %s, key: %s)", hex.Dump(w.lastKey), hex.Dump(key)) } sameKey := y.SameKey(key, w.lastKey) // Same keys should go into the same SSTable. if !sameKey && w.builder.ReachedCapacity(w.db.opt.MaxTableSize) { if err := w.send(false); err != nil { return err } } w.lastKey = y.SafeCopy(w.lastKey, key) var vp valuePointer if vs.Meta&bitValuePointer > 0 { vp.Decode(vs.Value) } w.builder.Add(key, vs, vp.Len) return nil } func (w *sortedWriter) send(done bool) error { if err := w.throttle.Do(); err != nil { return err } go func(builder *table.Builder) { err := w.createTable(builder) w.throttle.Done(err) }(w.builder) // If done is true, this indicates we can close the writer. // No need to allocate underlying TableBuilder now. if done { w.builder = nil return nil } dk, err := w.db.registry.latestDataKey() if err != nil { return y.Wrapf(err, "Error while retriving datakey in sortedWriter.send") } bopts := buildTableOptions(w.db.opt) bopts.DataKey = dk w.builder = table.NewTableBuilder(bopts) return nil } // Done is called once we are done writing all keys and valueStructs // to sortedWriter. It completes writing current SST to disk. func (w *sortedWriter) Done() error { if w.builder.Empty() { // Assign builder as nil, so that underlying memory can be garbage collected. 
w.builder = nil return nil } return w.send(true) } func (w *sortedWriter) createTable(builder *table.Builder) error { data := builder.Finish() if len(data) == 0 { return nil } fileID := w.db.lc.reserveFileID() opts := buildTableOptions(w.db.opt) opts.DataKey = builder.DataKey() opts.BlockCache = w.db.blockCache opts.IndexCache = w.db.indexCache var tbl *table.Table if w.db.opt.InMemory { var err error if tbl, err = table.OpenInMemoryTable(data, fileID, &opts); err != nil { return err } } else { fd, err := y.CreateSyncedFile(table.NewFilename(fileID, w.db.opt.Dir), true) if err != nil { return err } if _, err := fd.Write(data); err != nil { return err } if tbl, err = table.OpenTable(fd, opts); err != nil { return err } } lc := w.db.lc var lhandler *levelHandler // We should start the levels from 1, because we need level 0 to set the !badger!head key. We // cannot mix up this key with other keys from the DB, otherwise we would introduce a range // overlap violation. y.AssertTrue(len(lc.levels) > 1) for _, l := range lc.levels[1:] { ratio := float64(l.getTotalSize()) / float64(l.maxTotalSize) if ratio < 1.0 { lhandler = l break } } if lhandler == nil { // If we're exceeding the size of the lowest level, shove it in the lowest level. Can't do // better than that. lhandler = lc.levels[len(lc.levels)-1] } if w.streamID == headStreamId { // This is a special !badger!head key. We should store it at level 0, separate from all the // other keys to avoid an overlap. lhandler = lc.levels[0] } // Now that table can be opened successfully, let's add this to the MANIFEST. change := &pb.ManifestChange{ Id: tbl.ID(), KeyId: tbl.KeyID(), Op: pb.ManifestChange_CREATE, Level: uint32(lhandler.level), Compression: uint32(tbl.CompressionType()), } if err := w.db.manifest.addChanges([]*pb.ManifestChange{change}); err != nil { return err } // We are not calling lhandler.replaceTables() here, as it sorts tables on every addition. // We can sort all tables only once during Flush() call. lhandler.addTable(tbl) // Release the ref held by OpenTable. _ = tbl.DecrRef() w.db.opt.Infof("Table created: %d at level: %d for stream: %d. Size: %s\n", fileID, lhandler.level, w.streamID, humanize.Bytes(uint64(tbl.Size()))) return nil } badger-2.2007.2/stream_writer_test.go000066400000000000000000000346431372173116500174650ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package badger import ( "bytes" "encoding/binary" "fmt" "io/ioutil" "math" "math/rand" "os" "testing" "github.com/stretchr/testify/require" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" ) func getSortedKVList(valueSize, listSize int) *pb.KVList { value := make([]byte, valueSize) y.Check2(rand.Read(value)) list := &pb.KVList{} for i := 0; i < listSize; i++ { key := make([]byte, 8) binary.BigEndian.PutUint64(key, uint64(i)) list.Kv = append(list.Kv, &pb.KV{ Key: key, Value: value, Version: 20, }) } return list } // check if we can read values after writing using stream writer func TestStreamWriter1(t *testing.T) { test := func(t *testing.T, opts *Options) { runBadgerTest(t, opts, func(t *testing.T, db *DB) { // write entries using stream writer noOfKeys := 1000 valueSize := 128 list := getSortedKVList(valueSize, noOfKeys) sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") require.NoError(t, sw.Write(list), "sw.Write() failed") require.NoError(t, sw.Flush(), "sw.Flush() failed") err := db.View(func(txn *Txn) error { // read any random key from inserted keys keybyte := make([]byte, 8) keyNo := uint64(rand.Int63n(int64(noOfKeys))) binary.BigEndian.PutUint64(keybyte, keyNo) _, err := txn.Get(keybyte) require.Nil(t, err, "key should be found") // count all keys written using stream writer keysCount := 0 itrOps := DefaultIteratorOptions it := txn.NewIterator(itrOps) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { keysCount++ } require.True(t, keysCount == noOfKeys, "count of keys should be matched") return nil }) require.NoError(t, err, "error while retrieving key") }) } t.Run("Normal mode", func(t *testing.T) { normalModeOpts := getTestOptions("") test(t, &normalModeOpts) }) t.Run("Managed mode", func(t *testing.T) { managedModeOpts := getTestOptions("") managedModeOpts.managedTxns = true test(t, &managedModeOpts) }) t.Run("InMemory mode", func(t *testing.T) { diskLessModeOpts := getTestOptions("") diskLessModeOpts.InMemory = true test(t, &diskLessModeOpts) }) } // write more keys to db after writing keys using stream writer func TestStreamWriter2(t *testing.T) { test := func(t *testing.T, opts *Options) { runBadgerTest(t, opts, func(t *testing.T, db *DB) { // write entries using stream writer noOfKeys := 1000 valueSize := 128 list := getSortedKVList(valueSize, noOfKeys) sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") require.NoError(t, sw.Write(list), "sw.Write() failed") // get max version of sw, will be used in transactions for managed mode maxVs := sw.maxVersion require.NoError(t, sw.Flush(), "sw.Flush() failed") // delete all the inserted keys val := make([]byte, valueSize) y.Check2(rand.Read(val)) for i := 0; i < noOfKeys; i++ { txn := db.newTransaction(true, opts.managedTxns) if opts.managedTxns { txn.readTs = math.MaxUint64 txn.commitTs = maxVs } keybyte := make([]byte, 8) keyNo := uint64(i) binary.BigEndian.PutUint64(keybyte, keyNo) require.NoError(t, txn.Delete(keybyte), "error while deleting keys") require.NoError(t, txn.Commit(), "error while commit") } // verify while iteration count of keys should be 0 err := db.View(func(txn *Txn) error { keysCount := 0 itrOps := DefaultIteratorOptions it := txn.NewIterator(itrOps) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { keysCount++ } require.Zero(t, keysCount, "count of keys should be 0") return nil }) require.Nil(t, err, "error should be nil while iterating") }) } t.Run("Normal mode", func(t *testing.T) { normalModeOpts := 
getTestOptions("") test(t, &normalModeOpts) }) t.Run("Managed mode", func(t *testing.T) { managedModeOpts := getTestOptions("") managedModeOpts.managedTxns = true test(t, &managedModeOpts) }) t.Run("InMemory mode", func(t *testing.T) { diskLessModeOpts := getTestOptions("") diskLessModeOpts.InMemory = true test(t, &diskLessModeOpts) }) } func TestStreamWriter3(t *testing.T) { test := func(t *testing.T, opts *Options) { runBadgerTest(t, opts, func(t *testing.T, db *DB) { // write entries using stream writer noOfKeys := 1000 valueSize := 128 // insert keys which are even value := make([]byte, valueSize) y.Check2(rand.Read(value)) list := &pb.KVList{} counter := 0 for i := 0; i < noOfKeys; i++ { key := make([]byte, 8) binary.BigEndian.PutUint64(key, uint64(counter)) list.Kv = append(list.Kv, &pb.KV{ Key: key, Value: value, Version: 20, }) counter = counter + 2 } sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") require.NoError(t, sw.Write(list), "sw.Write() failed") // get max version of sw, will be used in transactions for managed mode maxVs := sw.maxVersion require.NoError(t, sw.Flush(), "sw.Flush() failed") // insert keys which are odd val := make([]byte, valueSize) y.Check2(rand.Read(val)) counter = 1 for i := 0; i < noOfKeys; i++ { txn := db.newTransaction(true, opts.managedTxns) if opts.managedTxns { txn.readTs = math.MaxUint64 txn.commitTs = maxVs } keybyte := make([]byte, 8) keyNo := uint64(counter) binary.BigEndian.PutUint64(keybyte, keyNo) require.NoError(t, txn.SetEntry(NewEntry(keybyte, val)), "error while inserting entries") require.NoError(t, txn.Commit(), "error while commit") counter = counter + 2 } // verify while iteration keys are in sorted order err := db.View(func(txn *Txn) error { keysCount := 0 itrOps := DefaultIteratorOptions it := txn.NewIterator(itrOps) defer it.Close() prev := uint64(0) for it.Rewind(); it.Valid(); it.Next() { item := it.Item() key := item.Key() current := binary.BigEndian.Uint64(key) if prev != 0 && current != (prev+uint64(1)) { t.Fatal("keys should be in increasing order") } keysCount++ prev = current } require.True(t, keysCount == 2*noOfKeys, "count of keys is not matching") return nil }) require.Nil(t, err, "error should be nil while iterating") }) } t.Run("Normal mode", func(t *testing.T) { normalModeOpts := getTestOptions("") test(t, &normalModeOpts) }) t.Run("Managed mode", func(t *testing.T) { managedModeOpts := getTestOptions("") managedModeOpts.managedTxns = true test(t, &managedModeOpts) }) t.Run("InMemory mode", func(t *testing.T) { diskLessModeOpts := getTestOptions("") diskLessModeOpts.InMemory = true test(t, &diskLessModeOpts) }) } // After inserting all data from streams, StreamWriter reinitializes Oracle and updates its nextTs // to maxVersion found in all entries inserted(if db is running in non managed mode). It also // updates Oracle's txnMark and readMark. If Oracle is not reinitialized, it might cause issue // while updating readMark and txnMark when its nextTs is ahead of maxVersion. This tests verifies // Oracle reinitialization is happening. Try commenting line 171 in stream_writer.go with code // (sw.db.orc = newOracle(sw.db.opt), this test should fail. 
func TestStreamWriter4(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // first insert some entries in db for i := 0; i < 10; i++ { err := db.Update(func(txn *Txn) error { key := []byte(fmt.Sprintf("key-%d", i)) value := []byte(fmt.Sprintf("val-%d", i)) return txn.Set(key, value) }) require.NoError(t, err, "error while updating db") } list := &pb.KVList{} list.Kv = append(list.Kv, &pb.KV{ Key: []byte("key-1"), Value: []byte("value-1"), Version: 1, }) sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") require.NoError(t, sw.Write(list), "sw.Write() failed") require.NoError(t, sw.Flush(), "sw.Flush() failed") }) } func TestStreamWriter5(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { list := &pb.KVList{} left := make([]byte, 6) left[0] = 0x00 copy(left[1:], []byte("break")) right := make([]byte, 6) right[0] = 0xff copy(right[1:], []byte("break")) list.Kv = append(list.Kv, &pb.KV{ Key: left, Value: []byte("val"), Version: 1, }) list.Kv = append(list.Kv, &pb.KV{ Key: right, Value: []byte("val"), Version: 1, }) sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") require.NoError(t, sw.Write(list), "sw.Write() failed") require.NoError(t, sw.Flush(), "sw.Flush() failed") require.NoError(t, db.Close()) var err error db, err = Open(db.opt) require.NoError(t, err) require.NoError(t, db.Close()) }) } // This test tries to insert multiple equal keys(without version) and verifies // if those are going to same table. func TestStreamWriter6(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { list := &pb.KVList{} str := []string{"a", "a", "b", "b", "c", "c"} ver := 1 for i := range str { kv := &pb.KV{ Key: bytes.Repeat([]byte(str[i]), int(db.opt.MaxTableSize)), Value: []byte("val"), Version: uint64(ver), } list.Kv = append(list.Kv, kv) ver = (ver + 1) % 2 } // list has 3 pairs for equal keys. Since each Key has size equal to MaxTableSize // we would have 6 tables, if keys are not equal. Here we should have 3 tables. 
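// The assertion further below expects 4 tables in total: these 3 plus the
// level 0 table that StreamWriter adds for the special !badger!head key.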
sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") require.NoError(t, sw.Write(list), "sw.Write() failed") require.NoError(t, sw.Flush(), "sw.Flush() failed") tables := db.Tables(true) require.Equal(t, 4, len(tables), "Count of tables not matching") for _, tab := range tables { if tab.Level > 0 { require.Equal(t, 2, int(tab.KeyCount), fmt.Sprintf("failed for level: %d", tab.Level)) } else { require.Equal(t, 1, int(tab.KeyCount)) // level 0 table will have head key } } require.NoError(t, db.Close()) db, err := Open(db.opt) require.NoError(t, err) require.NoError(t, db.Close()) }) } func TestStreamDone(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") var val [10]byte rand.Read(val[:]) for i := 0; i < 10; i++ { list := &pb.KVList{} kv1 := &pb.KV{ Key: []byte(fmt.Sprintf("%d", i)), Value: val[:], Version: 1, StreamId: uint32(i), } kv2 := &pb.KV{ StreamId: uint32(i), StreamDone: true, } list.Kv = append(list.Kv, kv1, kv2) require.NoError(t, sw.Write(list), "sw.Write() failed") } require.NoError(t, sw.Flush(), "sw.Flush() failed") require.NoError(t, db.Close()) var err error db, err = Open(db.opt) require.NoError(t, err) require.NoError(t, db.Close()) }) } func TestSendOnClosedStream(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer func() { require.NoError(t, os.RemoveAll(dir)) }() opts := getTestOptions(dir) db, err := Open(opts) require.NoError(t, err) sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") var val [10]byte rand.Read(val[:]) list := &pb.KVList{} kv1 := &pb.KV{ Key: []byte(fmt.Sprintf("%d", 1)), Value: val[:], Version: 1, StreamId: uint32(1), } kv2 := &pb.KV{ StreamId: uint32(1), StreamDone: true, } list.Kv = append(list.Kv, kv1, kv2) require.NoError(t, sw.Write(list), "sw.Write() failed") // Defer for panic. defer func() { require.NotNil(t, recover(), "should have paniced") require.NoError(t, sw.Flush()) require.NoError(t, db.Close()) }() // Send once stream is closed. list = &pb.KVList{} kv1 = &pb.KV{ Key: []byte(fmt.Sprintf("%d", 2)), Value: val[:], Version: 1, StreamId: uint32(1), } list.Kv = append(list.Kv, kv1) sw.Write(list) } func TestSendOnClosedStream2(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer func() { require.NoError(t, os.RemoveAll(dir)) }() opts := getTestOptions(dir) db, err := Open(opts) require.NoError(t, err) sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "sw.Prepare() failed") var val [10]byte rand.Read(val[:]) list := &pb.KVList{} kv1 := &pb.KV{ Key: []byte(fmt.Sprintf("%d", 1)), Value: val[:], Version: 1, StreamId: uint32(1), } kv2 := &pb.KV{ StreamId: uint32(1), StreamDone: true, } kv3 := &pb.KV{ Key: []byte(fmt.Sprintf("%d", 2)), Value: val[:], Version: 1, StreamId: uint32(1), } list.Kv = append(list.Kv, kv1, kv2, kv3) // Defer for panic. 
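// kv3 belongs to stream 1 but appears after that stream's StreamDone marker,
// so the Write below must panic; the deferred recover verifies that it does.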
defer func() { require.NotNil(t, recover(), "should have paniced") require.NoError(t, sw.Flush()) require.NoError(t, db.Close()) }() require.NoError(t, sw.Write(list), "sw.Write() failed") } func TestStreamWriterEncrypted(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) opts := DefaultOptions(dir) defer removeDir(dir) opts = opts.WithEncryptionKey([]byte("badgerkey16bytes")) db, err := Open(opts) require.NoError(t, err) key := []byte("mykey") value := []byte("myvalue") list := &pb.KVList{} list.Kv = append(list.Kv, &pb.KV{ Key: key, Value: value, Version: 20, }) sw := db.NewStreamWriter() require.NoError(t, sw.Prepare(), "Prepare failed") require.NoError(t, sw.Write(list), "Write failed") require.NoError(t, sw.Flush(), "Flush failed") err = db.View(func(txn *Txn) error { item, err := txn.Get(key) require.NoError(t, err) val, err := item.ValueCopy(nil) require.Equal(t, value, val) require.NoError(t, err) return nil }) require.NoError(t, err, "Error while retrieving key") require.NoError(t, db.Close()) opts = opts.WithEncryptionKey([]byte("badgerkey16bytes")) db, err = Open(opts) require.NoError(t, err) require.NoError(t, db.Close()) } badger-2.2007.2/structs.go000066400000000000000000000143621372173116500152420ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "encoding/binary" "fmt" "time" "unsafe" ) type valuePointer struct { Fid uint32 Len uint32 Offset uint32 } const vptrSize = unsafe.Sizeof(valuePointer{}) func (p valuePointer) Less(o valuePointer) bool { if p.Fid != o.Fid { return p.Fid < o.Fid } if p.Offset != o.Offset { return p.Offset < o.Offset } return p.Len < o.Len } func (p valuePointer) IsZero() bool { return p.Fid == 0 && p.Offset == 0 && p.Len == 0 } // Encode encodes Pointer into byte buffer. func (p valuePointer) Encode() []byte { b := make([]byte, vptrSize) // Copy over the content from p to b. *(*valuePointer)(unsafe.Pointer(&b[0])) = p return b } // Decode decodes the value pointer into the provided byte buffer. func (p *valuePointer) Decode(b []byte) { // Copy over data from b into p. Using *p=unsafe.pointer(...) leads to // pointer alignment issues. See https://github.com/dgraph-io/badger/issues/1096 // and comment https://github.com/dgraph-io/badger/pull/1097#pullrequestreview-307361714 copy(((*[vptrSize]byte)(unsafe.Pointer(p))[:]), b[:vptrSize]) } // header is used in value log as a header before Entry. type header struct { klen uint32 vlen uint32 expiresAt uint64 meta byte userMeta byte } const ( // Maximum possible size of the header. The maximum size of header struct will be 18 but the // maximum size of varint encoded header will be 21. maxHeaderSize = 21 ) // Encode encodes the header into []byte. The provided []byte should be atleast 5 bytes. The // function will panic if out []byte isn't large enough to hold all the values. 
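// Meta and UserMeta take one byte each, while the key length, value length
// and ExpiresAt are varint encoded, so the encoded size ranges from the
// 5-byte minimum up to maxHeaderSize bytes.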
// The encoded header looks like // +------+----------+------------+--------------+-----------+ // | Meta | UserMeta | Key Length | Value Length | ExpiresAt | // +------+----------+------------+--------------+-----------+ func (h header) Encode(out []byte) int { out[0], out[1] = h.meta, h.userMeta index := 2 index += binary.PutUvarint(out[index:], uint64(h.klen)) index += binary.PutUvarint(out[index:], uint64(h.vlen)) index += binary.PutUvarint(out[index:], h.expiresAt) return index } // Decode decodes the given header from the provided byte slice. // Returns the number of bytes read. func (h *header) Decode(buf []byte) int { h.meta, h.userMeta = buf[0], buf[1] index := 2 klen, count := binary.Uvarint(buf[index:]) h.klen = uint32(klen) index += count vlen, count := binary.Uvarint(buf[index:]) h.vlen = uint32(vlen) index += count h.expiresAt, count = binary.Uvarint(buf[index:]) return index + count } // DecodeFrom reads the header from the hashReader. // Returns the number of bytes read. func (h *header) DecodeFrom(reader *hashReader) (int, error) { var err error h.meta, err = reader.ReadByte() if err != nil { return 0, err } h.userMeta, err = reader.ReadByte() if err != nil { return 0, err } klen, err := binary.ReadUvarint(reader) if err != nil { return 0, err } h.klen = uint32(klen) vlen, err := binary.ReadUvarint(reader) if err != nil { return 0, err } h.vlen = uint32(vlen) h.expiresAt, err = binary.ReadUvarint(reader) if err != nil { return 0, err } return reader.bytesRead, nil } // Entry provides Key, Value, UserMeta and ExpiresAt. This struct can be used by // the user to set data. type Entry struct { Key []byte Value []byte UserMeta byte ExpiresAt uint64 // time.Unix meta byte version uint64 // Fields maintained internally. offset uint32 skipVlog bool hlen int // Length of the header. } func (e *Entry) estimateSize(threshold int) int { if len(e.Value) < threshold { return len(e.Key) + len(e.Value) + 2 // Meta, UserMeta } return len(e.Key) + 12 + 2 // 12 for ValuePointer, 2 for metas. } func (e Entry) print(prefix string) { fmt.Printf("%s Key: %s Meta: %d UserMeta: %d Offset: %d len(val)=%d", prefix, e.Key, e.meta, e.UserMeta, e.offset, len(e.Value)) } // NewEntry creates a new entry with key and value passed in args. This newly created entry can be // set in a transaction by calling txn.SetEntry(). All other properties of Entry can be set by // calling WithMeta, WithDiscard, WithTTL methods on it. // This function uses key and value reference, hence users must // not modify key and value until the end of transaction. func NewEntry(key, value []byte) *Entry { return &Entry{ Key: key, Value: value, } } // WithMeta adds meta data to Entry e. This byte is stored alongside the key // and can be used as an aid to interpret the value or store other contextual // bits corresponding to the key-value pair of entry. func (e *Entry) WithMeta(meta byte) *Entry { e.UserMeta = meta return e } // WithDiscard adds a marker to Entry e. This means all the previous versions of the key (of the // Entry) will be eligible for garbage collection. // This method is only useful if you have set a higher limit for options.NumVersionsToKeep. The // default setting is 1, in which case, this function doesn't add any more benefit. If however, you // have a higher setting for NumVersionsToKeep (in Dgraph, we set it to infinity), you can use this // method to indicate that all the older versions can be discarded and removed during compactions. 
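// For example: txn.SetEntry(NewEntry(key, value).WithDiscard()).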
func (e *Entry) WithDiscard() *Entry { e.meta = bitDiscardEarlierVersions return e } // WithTTL adds time to live duration to Entry e. Entry stored with a TTL would automatically expire // after the time has elapsed, and will be eligible for garbage collection. func (e *Entry) WithTTL(dur time.Duration) *Entry { e.ExpiresAt = uint64(time.Now().Add(dur).Unix()) return e } // withMergeBit sets merge bit in entry's metadata. This // function is called by MergeOperator's Add method. func (e *Entry) withMergeBit() *Entry { e.meta = bitMergeEntry return e } badger-2.2007.2/table/000077500000000000000000000000001372173116500142655ustar00rootroot00000000000000badger-2.2007.2/table/README.md000066400000000000000000000070511372173116500155470ustar00rootroot00000000000000Size of table is 123,217,667 bytes for all benchmarks. # BenchmarkRead ``` $ go test -bench ^BenchmarkRead$ -run ^$ -count 3 goos: linux goarch: amd64 pkg: github.com/dgraph-io/badger/table BenchmarkRead-16 10 154074944 ns/op BenchmarkRead-16 10 154340411 ns/op BenchmarkRead-16 10 151914489 ns/op PASS ok github.com/dgraph-io/badger/table 22.467s ``` Size of table is 123,217,667 bytes, which is ~118MB. The rate is ~762MB/s using LoadToRAM (when table is in RAM). To read a 64MB table, this would take ~0.084s, which is negligible. # BenchmarkReadAndBuild ```go $ go test -bench BenchmarkReadAndBuild -run ^$ -count 3 goos: linux goarch: amd64 pkg: github.com/dgraph-io/badger/table BenchmarkReadAndBuild-16 1 1026755231 ns/op BenchmarkReadAndBuild-16 1 1009543316 ns/op BenchmarkReadAndBuild-16 1 1039920546 ns/op PASS ok github.com/dgraph-io/badger/table 12.081s ``` The rate is ~123MB/s. To build a 64MB table, this would take ~0.56s. Note that this does NOT include the flushing of the table to disk. All we are doing above is reading one table (which is in RAM) and write one table in memory. The table building takes 0.56-0.084s ~ 0.4823s. # BenchmarkReadMerged Below, we merge 5 tables. The total size remains unchanged at ~122M. ```go $ go test -bench ReadMerged -run ^$ -count 3 goos: linux goarch: amd64 pkg: github.com/dgraph-io/badger/table BenchmarkReadMerged-16 2 977588975 ns/op BenchmarkReadMerged-16 2 982140738 ns/op BenchmarkReadMerged-16 2 962046017 ns/op PASS ok github.com/dgraph-io/badger/table 27.433s ``` The rate is ~120MB/s. To read a 64MB table using merge iterator, this would take ~0.53s. # BenchmarkRandomRead ```go go test -bench BenchmarkRandomRead$ -run ^$ -count 3 goos: linux goarch: amd64 pkg: github.com/dgraph-io/badger/table BenchmarkRandomRead-16 500000 2645 ns/op BenchmarkRandomRead-16 500000 2648 ns/op BenchmarkRandomRead-16 500000 2614 ns/op PASS ok github.com/dgraph-io/badger/table 50.850s ``` For random read benchmarking, we are randomly reading a key and verifying its value. # DB Open benchmark 1. Create badger DB with 2 billion key-value pairs (about 380GB of data) ``` badger fill -m 2000 --dir="/tmp/data" --sorted ``` 2. Clear buffers and swap memory ``` free -mh && sync && echo 3 | sudo tee /proc/sys/vm/drop_caches && sudo swapoff -a && sudo swapon -a && free -mh ``` Also flush disk buffers ``` blockdev --flushbufs /dev/nvme0n1p4 ``` 3. 
Run the benchmark ``` go test -run=^$ github.com/dgraph-io/badger -bench ^BenchmarkDBOpen$ -benchdir="/tmp/data" -v badger 2019/06/04 17:15:56 INFO: 126 tables out of 1028 opened in 3.017s badger 2019/06/04 17:15:59 INFO: 257 tables out of 1028 opened in 6.014s badger 2019/06/04 17:16:02 INFO: 387 tables out of 1028 opened in 9.017s badger 2019/06/04 17:16:05 INFO: 516 tables out of 1028 opened in 12.025s badger 2019/06/04 17:16:08 INFO: 645 tables out of 1028 opened in 15.013s badger 2019/06/04 17:16:11 INFO: 775 tables out of 1028 opened in 18.008s badger 2019/06/04 17:16:14 INFO: 906 tables out of 1028 opened in 21.003s badger 2019/06/04 17:16:17 INFO: All 1028 tables opened in 23.851s badger 2019/06/04 17:16:17 INFO: Replaying file id: 1998 at offset: 332000 badger 2019/06/04 17:16:17 INFO: Replay took: 9.81µs goos: linux goarch: amd64 pkg: github.com/dgraph-io/badger BenchmarkDBOpen-16 1 23930082140 ns/op PASS ok github.com/dgraph-io/badger 24.076s ``` It takes about 23.851s to open a DB with 2 billion sorted key-value entries. badger-2.2007.2/table/builder.go000066400000000000000000000266171372173116500162560ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package table import ( "bytes" "crypto/aes" "math" "unsafe" "github.com/dgryski/go-farm" "github.com/golang/protobuf/proto" "github.com/golang/snappy" "github.com/pkg/errors" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/dgraph-io/ristretto/z" ) func newBuffer(sz int) *bytes.Buffer { b := new(bytes.Buffer) b.Grow(sz) return b } type header struct { overlap uint16 // Overlap with base key. diff uint16 // Length of the diff. } const headerSize = uint16(unsafe.Sizeof(header{})) // Encode encodes the header. func (h header) Encode() []byte { var b [4]byte *(*header)(unsafe.Pointer(&b[0])) = h return b[:] } // Decode decodes the header. func (h *header) Decode(buf []byte) { // Copy over data from buf into h. Using *h=unsafe.pointer(...) leads to // pointer alignment issues. See https://github.com/dgraph-io/badger/issues/1096 // and comment https://github.com/dgraph-io/badger/pull/1097#pullrequestreview-307361714 copy(((*[headerSize]byte)(unsafe.Pointer(h))[:]), buf[:headerSize]) } // Builder is used in building a table. type Builder struct { // Typically tens or hundreds of meg. This is for one single file. buf *bytes.Buffer baseKey []byte // Base key for the current block. baseOffset uint32 // Offset for the current block. entryOffsets []uint32 // Offsets of entries present in current block. tableIndex *pb.TableIndex keyHashes []uint64 // Used for building the bloomfilter. opt *Options } // NewTableBuilder makes a new TableBuilder. func NewTableBuilder(opts Options) *Builder { return &Builder{ buf: newBuffer(1 << 20), tableIndex: &pb.TableIndex{}, keyHashes: make([]uint64, 0, 1024), // Avoid some malloc calls. opt: &opts, } } // Close closes the TableBuilder. 
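// It is currently a no-op.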
func (b *Builder) Close() {} // Empty returns whether it's empty. func (b *Builder) Empty() bool { return b.buf.Len() == 0 } // keyDiff returns a suffix of newKey that is different from b.baseKey. func (b *Builder) keyDiff(newKey []byte) []byte { var i int for i = 0; i < len(newKey) && i < len(b.baseKey); i++ { if newKey[i] != b.baseKey[i] { break } } return newKey[i:] } func (b *Builder) addHelper(key []byte, v y.ValueStruct, vpLen uint64) { b.keyHashes = append(b.keyHashes, farm.Fingerprint64(y.ParseKey(key))) // diffKey stores the difference of key with baseKey. var diffKey []byte if len(b.baseKey) == 0 { // Make a copy. Builder should not keep references. Otherwise, caller has to be very careful // and will have to make copies of keys every time they add to builder, which is even worse. b.baseKey = append(b.baseKey[:0], key...) diffKey = key } else { diffKey = b.keyDiff(key) } y.AssertTrue(len(key)-len(diffKey) <= math.MaxUint16) y.AssertTrue(len(diffKey) <= math.MaxUint16) h := header{ overlap: uint16(len(key) - len(diffKey)), diff: uint16(len(diffKey)), } // store current entry's offset y.AssertTrue(uint32(b.buf.Len()) < math.MaxUint32) b.entryOffsets = append(b.entryOffsets, uint32(b.buf.Len())-b.baseOffset) // Layout: header, diffKey, value. b.buf.Write(h.Encode()) b.buf.Write(diffKey) // We only need to store the key difference. v.EncodeTo(b.buf) // Size of KV on SST. sstSz := uint64(uint32(headerSize) + uint32(len(diffKey)) + v.EncodedSize()) // Total estimated size = size on SST + size on vlog (length of value pointer). b.tableIndex.EstimatedSize += (sstSz + vpLen) } /* Structure of Block. +-------------------+---------------------+--------------------+--------------+------------------+ | Entry1 | Entry2 | Entry3 | Entry4 | Entry5 | +-------------------+---------------------+--------------------+--------------+------------------+ | Entry6 | ... | ... | ... | EntryN | +-------------------+---------------------+--------------------+--------------+------------------+ | Block Meta(contains list of offsets used| Block Meta Size | Block | Checksum Size | | to perform binary search in the block) | (4 Bytes) | Checksum | (4 Bytes) | +-----------------------------------------+--------------------+--------------+------------------+ */ // In case the data is encrypted, the "IV" is added to the end of the block. func (b *Builder) finishBlock() { b.buf.Write(y.U32SliceToBytes(b.entryOffsets)) b.buf.Write(y.U32ToBytes(uint32(len(b.entryOffsets)))) blockBuf := b.buf.Bytes()[b.baseOffset:] // Store checksum for current block. b.writeChecksum(blockBuf) // Compress the block. if b.opt.Compression != options.None { var err error // TODO: Find a way to reuse buffers. Current implementation creates a // new buffer for each compressData call. blockBuf, err = b.compressData(b.buf.Bytes()[b.baseOffset:]) y.Check(err) // Truncate already written data. b.buf.Truncate(int(b.baseOffset)) // Write compressed data. b.buf.Write(blockBuf) } if b.shouldEncrypt() { block := b.buf.Bytes()[b.baseOffset:] eBlock, err := b.encrypt(block) y.Check(y.Wrapf(err, "Error while encrypting block in table builder.")) // We're rewriting the block, after encrypting. b.buf.Truncate(int(b.baseOffset)) b.buf.Write(eBlock) } // TODO(Ashish):Add padding: If we want to make block as multiple of OS pages, we can // implement padding. This might be useful while using direct I/O. 
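// The block is now complete: entries, entry offsets, offset count and
// checksum, optionally compressed and/or encrypted with the IV appended.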
// Add key to the block index bo := &pb.BlockOffset{ Key: y.Copy(b.baseKey), Offset: b.baseOffset, Len: uint32(b.buf.Len()) - b.baseOffset, } b.tableIndex.Offsets = append(b.tableIndex.Offsets, bo) } func (b *Builder) shouldFinishBlock(key []byte, value y.ValueStruct) bool { // If there is no entry till now, we will return false. if len(b.entryOffsets) <= 0 { return false } // Integer overflow check for statements below. y.AssertTrue((uint32(len(b.entryOffsets))+1)*4+4+8+4 < math.MaxUint32) // We should include current entry also in size, that's why +1 to len(b.entryOffsets). entriesOffsetsSize := uint32((len(b.entryOffsets)+1)*4 + 4 + // size of list 8 + // Sum64 in checksum proto 4) // checksum length estimatedSize := uint32(b.buf.Len()) - b.baseOffset + uint32(6 /*header size for entry*/) + uint32(len(key)) + uint32(value.EncodedSize()) + entriesOffsetsSize if b.shouldEncrypt() { // IV is added at the end of the block, while encrypting. // So, size of IV is added to estimatedSize. estimatedSize += aes.BlockSize } return estimatedSize > uint32(b.opt.BlockSize) } // Add adds a key-value pair to the block. func (b *Builder) Add(key []byte, value y.ValueStruct, valueLen uint32) { if b.shouldFinishBlock(key, value) { b.finishBlock() // Start a new block. Initialize the block. b.baseKey = []byte{} y.AssertTrue(uint32(b.buf.Len()) < math.MaxUint32) b.baseOffset = uint32(b.buf.Len()) b.entryOffsets = b.entryOffsets[:0] } b.addHelper(key, value, uint64(valueLen)) } // ReachedCapacity returns true if the builder's estimated final size -- the blocks written so far, the current block's footer, and a rough approximation of the index -- exceeds cap. The estimate is rough, so the finished table can differ from it slightly. // TODO: Look into why there is a discrepancy. I suspect it is because of Write(empty, empty) at the end. The diff can vary. func (b *Builder) ReachedCapacity(cap int64) bool { blocksSize := b.buf.Len() + // length of current buffer len(b.entryOffsets)*4 + // all entry offsets size 4 + // count of all entry offsets 8 + // checksum bytes 4 // checksum length estimateSz := blocksSize + 4 + // Index length 5*(len(b.tableIndex.Offsets)) // approximate index size return int64(estimateSz) > cap } // Finish finishes the table by appending the index. /* The table structure looks like +---------+------------+-----------+---------------+ | Block 1 | Block 2 | Block 3 | Block 4 | +---------+------------+-----------+---------------+ | Block 5 | Block 6 | Block ... | Block N | +---------+------------+-----------+---------------+ | Index | Index Size | Checksum | Checksum Size | +---------+------------+-----------+---------------+ */ // In case the data is encrypted, the "IV" is added to the end of the index. func (b *Builder) Finish() []byte { bf := z.NewBloomFilter(float64(len(b.keyHashes)), b.opt.BloomFalsePositive) for _, h := range b.keyHashes { bf.Add(h) } // Add bloom filter to the index. b.tableIndex.BloomFilter = bf.JSONMarshal() b.finishBlock() // This will never start a new block. index, err := proto.Marshal(b.tableIndex) y.Check(err) if b.shouldEncrypt() { index, err = b.encrypt(index) y.Check(err) } // Write index to the file. n, err := b.buf.Write(index) y.Check(err) y.AssertTrue(uint32(n) < math.MaxUint32) // Write index size. _, err = b.buf.Write(y.U32ToBytes(uint32(n))) y.Check(err) b.writeChecksum(index) return b.buf.Bytes() } func (b *Builder) writeChecksum(data []byte) { // Build checksum for the index. checksum := pb.Checksum{ // TODO: The checksum type should be configurable from the // options.
// We chose to use CRC32 as the default option because // it performed better compared to xxHash64. // See the BenchmarkChecksum in table_test.go file // Size => 1024 B 2048 B // CRC32 => 63.7 ns/op 112 ns/op // xxHash64 => 87.5 ns/op 158 ns/op Sum: y.CalculateChecksum(data, pb.Checksum_CRC32C), Algo: pb.Checksum_CRC32C, } // Write checksum to the file. chksum, err := proto.Marshal(&checksum) y.Check(err) n, err := b.buf.Write(chksum) y.Check(err) y.AssertTrue(uint32(n) < math.MaxUint32) // Write checksum size. _, err = b.buf.Write(y.U32ToBytes(uint32(n))) y.Check(err) } // DataKey returns the datakey of the builder. func (b *Builder) DataKey() *pb.DataKey { return b.opt.DataKey } // encrypt encrypts the given data and appends the IV to the end of the encrypted data. // It should be called only after checking the shouldEncrypt method. func (b *Builder) encrypt(data []byte) ([]byte, error) { iv, err := y.GenerateIV() if err != nil { return data, y.Wrapf(err, "Error while generating IV in Builder.encrypt") } data, err = y.XORBlock(data, b.DataKey().Data, iv) if err != nil { return data, y.Wrapf(err, "Error while encrypting in Builder.encrypt") } data = append(data, iv...) return data, nil } // shouldEncrypt tells us whether to encrypt the data or not. // We encrypt only if the data key exists. func (b *Builder) shouldEncrypt() bool { return b.opt.DataKey != nil } // compressData compresses the given data. func (b *Builder) compressData(data []byte) ([]byte, error) { switch b.opt.Compression { case options.None: return data, nil case options.Snappy: return snappy.Encode(nil, data), nil case options.ZSTD: return y.ZSTDCompress(nil, data, b.opt.ZSTDCompressionLevel) } return nil, errors.New("Unsupported compression type") } badger-2.2007.2/table/builder_test.go000066400000000000000000000110651372173116500173040ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package table import ( "fmt" "math/rand" "os" "testing" "time" "github.com/stretchr/testify/require" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" ) func TestTableIndex(t *testing.T) { rand.Seed(time.Now().Unix()) keyPrefix := "key" t.Run("single key", func(t *testing.T) { opts := Options{Compression: options.ZSTD} f := buildTestTable(t, keyPrefix, 1, opts) tbl, err := OpenTable(f, opts) require.NoError(t, err) require.Len(t, tbl.blockOffset, 1) }) t.Run("multiple keys", func(t *testing.T) { opts := []Options{} // Normal mode. opts = append(opts, Options{BlockSize: 4 * 1024, BloomFalsePositive: 0.01}) // Encryption mode. key := make([]byte, 32) _, err := rand.Read(key) require.NoError(t, err) opts = append(opts, Options{BlockSize: 4 * 1024, BloomFalsePositive: 0.01, DataKey: &pb.DataKey{Data: key}}) // Compression mode.
opts = append(opts, Options{BlockSize: 4 * 1024, BloomFalsePositive: 0.01, Compression: options.ZSTD}) keysCount := 10000 for _, opt := range opts { builder := NewTableBuilder(opt) filename := fmt.Sprintf("%s%c%d.sst", os.TempDir(), os.PathSeparator, rand.Uint32()) f, err := y.OpenSyncedFile(filename, true) require.NoError(t, err) blockFirstKeys := make([][]byte, 0) blockCount := 0 for i := 0; i < keysCount; i++ { k := []byte(fmt.Sprintf("%016x", i)) v := fmt.Sprintf("%d", i) vs := y.ValueStruct{Value: []byte(v)} if i == 0 { // This is the first key for the first block. blockFirstKeys = append(blockFirstKeys, k) blockCount = 1 } else if builder.shouldFinishBlock(k, vs) { blockCount++ blockFirstKeys = append(blockFirstKeys, k) } builder.Add(k, vs, 0) } _, err = f.Write(builder.Finish()) require.NoError(t, err, "unable to write to file") tbl, err := OpenTable(f, opt) require.NoError(t, err, "unable to open table") if opt.DataKey == nil { // key id is zero if there is no datakey. require.Equal(t, tbl.KeyID(), uint64(0)) } // Ensure index is built correctly require.Equal(t, blockCount, tbl.noOfBlocks) idx, err := tbl.readTableIndex() require.NoError(t, err) for i, ko := range idx.Offsets { require.Equal(t, ko.Key, blockFirstKeys[i]) } f.Close() require.NoError(t, os.RemoveAll(filename)) } }) } func TestInvalidCompression(t *testing.T) { keyPrefix := "key" opts := Options{Compression: options.ZSTD} f := buildTestTable(t, keyPrefix, 1000, opts) t.Run("with correct decompression algo", func(t *testing.T) { _, err := OpenTable(f, opts) require.NoError(t, err) }) t.Run("with incorrect decompression algo", func(t *testing.T) { // Set incorrect compression algorithm. opts.Compression = options.Snappy _, err := OpenTable(f, opts) require.Error(t, err) }) } func BenchmarkBuilder(b *testing.B) { rand.Seed(time.Now().Unix()) key := func(i int) []byte { return []byte(fmt.Sprintf("%032d", i)) } val := make([]byte, 32) rand.Read(val) vs := y.ValueStruct{Value: []byte(val)} keysCount := 1300000 // This number of entries consumes ~64MB of memory. bench := func(b *testing.B, opt *Options) { // KeyCount * (keySize + ValSize) b.SetBytes(int64(keysCount) * (32 + 32)) for i := 0; i < b.N; i++ { opt.BlockSize = 4 * 1024 opt.BloomFalsePositive = 0.01 builder := NewTableBuilder(*opt) for i := 0; i < keysCount; i++ { builder.Add(key(i), vs, 0) } _ = builder.Finish() } } b.Run("no compression", func(b *testing.B) { var opt Options opt.Compression = options.None bench(b, &opt) }) b.Run("zstd compression", func(b *testing.B) { var opt Options opt.Compression = options.ZSTD b.Run("level 1", func(b *testing.B) { opt.ZSTDCompressionLevel = 1 bench(b, &opt) }) b.Run("level 3", func(b *testing.B) { opt.ZSTDCompressionLevel = 3 bench(b, &opt) }) b.Run("level 15", func(b *testing.B) { opt.ZSTDCompressionLevel = 15 bench(b, &opt) }) }) } badger-2.2007.2/table/iterator.go000066400000000000000000000300561372173116500164510ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package table import ( "bytes" "io" "sort" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" ) type blockIterator struct { data []byte idx int // Idx of the entry inside a block err error baseKey []byte key []byte val []byte entryOffsets []uint32 // prevOverlap stores the overlap of the previous key with the base key. // This avoids unnecessary copy of base key when the overlap is same for multiple keys. prevOverlap uint16 } func (itr *blockIterator) setBlock(b *block) { itr.err = nil itr.idx = 0 itr.baseKey = itr.baseKey[:0] itr.prevOverlap = 0 itr.key = itr.key[:0] itr.val = itr.val[:0] // Drop the index from the block. We don't need it anymore. itr.data = b.data[:b.entriesIndexStart] itr.entryOffsets = b.entryOffsets } // setIdx sets the iterator to the entry at index i and sets its key and value. func (itr *blockIterator) setIdx(i int) { itr.idx = i if i >= len(itr.entryOffsets) || i < 0 { itr.err = io.EOF return } itr.err = nil startOffset := int(itr.entryOffsets[i]) // Set base key. if len(itr.baseKey) == 0 { var baseHeader header baseHeader.Decode(itr.data) itr.baseKey = itr.data[headerSize : headerSize+baseHeader.diff] } var endOffset int // idx points to the last entry in the block. if itr.idx+1 == len(itr.entryOffsets) { endOffset = len(itr.data) } else { // idx points to some entry other than the last one in the block. // EndOffset of the current entry is the start offset of the next entry. endOffset = int(itr.entryOffsets[itr.idx+1]) } entryData := itr.data[startOffset:endOffset] var h header h.Decode(entryData) // Header contains the length of the key's overlap with the base key and the length of the // diff. If the previous key's overlap was the same or longer, that shared prefix is already // in itr.key and we avoid copying it from the base key again. If it was shorter, we copy // over just the missing portion. if h.overlap > itr.prevOverlap { itr.key = append(itr.key[:itr.prevOverlap], itr.baseKey[itr.prevOverlap:h.overlap]...) } itr.prevOverlap = h.overlap valueOff := headerSize + h.diff diffKey := entryData[headerSize:valueOff] itr.key = append(itr.key[:h.overlap], diffKey...) itr.val = entryData[valueOff:] } func (itr *blockIterator) Valid() bool { return itr != nil && itr.err == nil } func (itr *blockIterator) Error() error { return itr.err } func (itr *blockIterator) Close() {} var ( origin = 0 current = 1 ) // seek brings us to the first block element that is >= input key. func (itr *blockIterator) seek(key []byte, whence int) { itr.err = nil startIndex := 0 // This tells from which index we should start binary search. switch whence { case origin: // We don't need to do anything. startIndex is already at 0 case current: startIndex = itr.idx } foundEntryIdx := sort.Search(len(itr.entryOffsets), func(idx int) bool { // If idx is less than start index then just return false. if idx < startIndex { return false } itr.setIdx(idx) return y.CompareKeys(itr.key, key) >= 0 }) itr.setIdx(foundEntryIdx) } // seekToFirst brings us to the first element. func (itr *blockIterator) seekToFirst() { itr.setIdx(0) } // seekToLast brings us to the last element. func (itr *blockIterator) seekToLast() { itr.setIdx(len(itr.entryOffsets) - 1) } func (itr *blockIterator) next() { itr.setIdx(itr.idx + 1) } func (itr *blockIterator) prev() { itr.setIdx(itr.idx - 1) } // Iterator is an iterator for a Table. type Iterator struct { t *Table bpos int bi blockIterator err error // Internally, Iterator is bidirectional. However, we only expose the // unidirectional functionality for now.
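// When reversed is true, Next() steps to smaller keys and Rewind() positions
// the iterator at the last entry of the table instead of the first.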
reversed bool } // NewIterator returns a new iterator of the Table func (t *Table) NewIterator(reversed bool) *Iterator { t.IncrRef() // Important. ti := &Iterator{t: t, reversed: reversed} ti.next() return ti } // Close closes the iterator (and it must be called). func (itr *Iterator) Close() error { return itr.t.DecrRef() } func (itr *Iterator) reset() { itr.bpos = 0 itr.err = nil } // Valid follows the y.Iterator interface func (itr *Iterator) Valid() bool { return itr.err == nil } func (itr *Iterator) seekToFirst() { numBlocks := itr.t.noOfBlocks if numBlocks == 0 { itr.err = io.EOF return } itr.bpos = 0 block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return } itr.bi.setBlock(block) itr.bi.seekToFirst() itr.err = itr.bi.Error() } func (itr *Iterator) seekToLast() { numBlocks := itr.t.noOfBlocks if numBlocks == 0 { itr.err = io.EOF return } itr.bpos = numBlocks - 1 block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return } itr.bi.setBlock(block) itr.bi.seekToLast() itr.err = itr.bi.Error() } func (itr *Iterator) seekHelper(blockIdx int, key []byte) { itr.bpos = blockIdx block, err := itr.t.block(blockIdx) if err != nil { itr.err = err return } itr.bi.setBlock(block) itr.bi.seek(key, origin) itr.err = itr.bi.Error() } // seekFrom brings us to a key that is >= input key. func (itr *Iterator) seekFrom(key []byte, whence int) { itr.err = nil switch whence { case origin: itr.reset() case current: } idx := sort.Search(itr.t.noOfBlocks, func(idx int) bool { ko := itr.t.blockOffsets()[idx] return y.CompareKeys(ko.Key, key) > 0 }) if idx == 0 { // The smallest key in our table is already strictly > key. We can return that. // This is like a SeekToFirst. itr.seekHelper(0, key) return } // block[idx].smallest is > key. // Since idx>0, we know block[idx-1].smallest is <= key. // There are two cases. // 1) Everything in block[idx-1] is strictly < key. In this case, we should go to the first // element of block[idx]. // 2) Some element in block[idx-1] is >= key. We should go to that element. itr.seekHelper(idx-1, key) if itr.err == io.EOF { // Case 1. Need to visit block[idx]. if idx == itr.t.noOfBlocks { // If idx == len(itr.t.blockIndex), then input key is greater than ANY element of table. // There's nothing we can do. Valid() should return false as we seek to end of table. return } // Since block[idx].smallest is > key. This is essentially a block[idx].SeekToFirst. itr.seekHelper(idx, key) } // Case 2: No need to do anything. We already did the seek in block[idx-1]. } // seek will reset iterator and seek to >= key. func (itr *Iterator) seek(key []byte) { itr.seekFrom(key, origin) } // seekForPrev will reset iterator and seek to <= key. func (itr *Iterator) seekForPrev(key []byte) { // TODO: Optimize this. We shouldn't have to take a Prev step. 
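// The current strategy: seek forward to the first entry >= key, and if that
// entry is not an exact match, take one prev() step to land on the largest
// entry <= key. E.g. with keys {1, 3, 7}, seekForPrev(5) seeks to 7, sees
// 7 != 5, and steps back to 3.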
itr.seekFrom(key, origin) if !bytes.Equal(itr.Key(), key) { itr.prev() } } func (itr *Iterator) next() { itr.err = nil if itr.bpos >= itr.t.noOfBlocks { itr.err = io.EOF return } if len(itr.bi.data) == 0 { block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return } itr.bi.setBlock(block) itr.bi.seekToFirst() itr.err = itr.bi.Error() return } itr.bi.next() if !itr.bi.Valid() { itr.bpos++ itr.bi.data = nil itr.next() return } } func (itr *Iterator) prev() { itr.err = nil if itr.bpos < 0 { itr.err = io.EOF return } if len(itr.bi.data) == 0 { block, err := itr.t.block(itr.bpos) if err != nil { itr.err = err return } itr.bi.setBlock(block) itr.bi.seekToLast() itr.err = itr.bi.Error() return } itr.bi.prev() if !itr.bi.Valid() { itr.bpos-- itr.bi.data = nil itr.prev() return } } // Key follows the y.Iterator interface. // Returns the key with timestamp. func (itr *Iterator) Key() []byte { return itr.bi.key } // Value follows the y.Iterator interface func (itr *Iterator) Value() (ret y.ValueStruct) { ret.Decode(itr.bi.val) return } // ValueCopy copies the current value and returns it as decoded // ValueStruct. func (itr *Iterator) ValueCopy() (ret y.ValueStruct) { dst := y.Copy(itr.bi.val) ret.Decode(dst) return } // Next follows the y.Iterator interface func (itr *Iterator) Next() { if !itr.reversed { itr.next() } else { itr.prev() } } // Rewind follows the y.Iterator interface func (itr *Iterator) Rewind() { if !itr.reversed { itr.seekToFirst() } else { itr.seekToLast() } } // Seek follows the y.Iterator interface func (itr *Iterator) Seek(key []byte) { if !itr.reversed { itr.seek(key) } else { itr.seekForPrev(key) } } // ConcatIterator concatenates the sequences defined by several iterators. (It only works with // TableIterators, probably just because it's faster to not be so generic.) type ConcatIterator struct { idx int // Which iterator is active now. cur *Iterator iters []*Iterator // Corresponds to tables. tables []*Table // Disregarding reversed, this is in ascending order. reversed bool } // NewConcatIterator creates a new concatenated iterator func NewConcatIterator(tbls []*Table, reversed bool) *ConcatIterator { iters := make([]*Iterator, len(tbls)) for i := 0; i < len(tbls); i++ { // Increment the reference count. Since, we're not creating the iterator right now. // Here, We'll hold the reference of the tables, till the lifecycle of the iterator. tbls[i].IncrRef() // Save cycles by not initializing the iterators until needed. // iters[i] = tbls[i].NewIterator(reversed) } return &ConcatIterator{ reversed: reversed, iters: iters, tables: tbls, idx: -1, // Not really necessary because s.it.Valid()=false, but good to have. } } func (s *ConcatIterator) setIdx(idx int) { s.idx = idx if idx < 0 || idx >= len(s.iters) { s.cur = nil return } if s.iters[idx] == nil { s.iters[idx] = s.tables[idx].NewIterator(s.reversed) } s.cur = s.iters[s.idx] } // Rewind implements y.Interface func (s *ConcatIterator) Rewind() { if len(s.iters) == 0 { return } if !s.reversed { s.setIdx(0) } else { s.setIdx(len(s.iters) - 1) } s.cur.Rewind() } // Valid implements y.Interface func (s *ConcatIterator) Valid() bool { return s.cur != nil && s.cur.Valid() } // Key implements y.Interface func (s *ConcatIterator) Key() []byte { return s.cur.Key() } // Value implements y.Interface func (s *ConcatIterator) Value() y.ValueStruct { return s.cur.Value() } // Seek brings us to element >= key if reversed is false. Otherwise, <= key. 
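// The binary search below picks the one table that could contain key: for
// reversed=false, the first table whose Biggest() is >= key; for
// reversed=true, the last table whose Smallest() is <= key. For example, with
// three tables whose biggest keys are 5, 9 and 15, a forward Seek(7) selects
// the second table and then seeks within it.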
func (s *ConcatIterator) Seek(key []byte) { var idx int if !s.reversed { idx = sort.Search(len(s.tables), func(i int) bool { return y.CompareKeys(s.tables[i].Biggest(), key) >= 0 }) } else { n := len(s.tables) idx = n - 1 - sort.Search(n, func(i int) bool { return y.CompareKeys(s.tables[n-1-i].Smallest(), key) <= 0 }) } if idx >= len(s.tables) || idx < 0 { s.setIdx(-1) return } // For reversed=false, we know s.tables[i-1].Biggest() < key. Thus, the // previous table cannot possibly contain key. s.setIdx(idx) s.cur.Seek(key) } // Next advances our concat iterator. func (s *ConcatIterator) Next() { s.cur.Next() if s.cur.Valid() { // Nothing to do. Just stay with the current table. return } for { // In case there are empty tables. if !s.reversed { s.setIdx(s.idx + 1) } else { s.setIdx(s.idx - 1) } if s.cur == nil { // End of list. Valid will become false. return } s.cur.Rewind() if s.cur.Valid() { break } } } // Close implements y.Interface. func (s *ConcatIterator) Close() error { for _, t := range s.tables { // DeReference the tables while closing the iterator. if err := t.DecrRef(); err != nil { return err } } for _, it := range s.iters { if it == nil { continue } if err := it.Close(); err != nil { return errors.Wrap(err, "ConcatIterator") } } return nil } badger-2.2007.2/table/merge_iterator.go000066400000000000000000000120361372173116500176260ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package table import ( "bytes" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" ) // MergeIterator merges multiple iterators. // NOTE: MergeIterator owns the array of iterators and is responsible for closing them. type MergeIterator struct { left node right node small *node curKey []byte reverse bool } type node struct { valid bool key []byte iter y.Iterator // The two iterators are type asserted from `y.Iterator`, used to inline more function calls. // Calling functions on concrete types is much faster (about 25-30%) than calling the // interface's function. merge *MergeIterator concat *ConcatIterator } func (n *node) setIterator(iter y.Iterator) { n.iter = iter // It's okay if the type assertion below fails and n.merge/n.concat are set to nil. // We handle the nil values of merge and concat in all the methods. 
n.merge, _ = iter.(*MergeIterator) n.concat, _ = iter.(*ConcatIterator) } func (n *node) setKey() { switch { case n.merge != nil: n.valid = n.merge.small.valid if n.valid { n.key = n.merge.small.key } case n.concat != nil: n.valid = n.concat.Valid() if n.valid { n.key = n.concat.Key() } default: n.valid = n.iter.Valid() if n.valid { n.key = n.iter.Key() } } } func (n *node) next() { switch { case n.merge != nil: n.merge.Next() case n.concat != nil: n.concat.Next() default: n.iter.Next() } n.setKey() } func (n *node) rewind() { n.iter.Rewind() n.setKey() } func (n *node) seek(key []byte) { n.iter.Seek(key) n.setKey() } func (mi *MergeIterator) fix() { if !mi.bigger().valid { return } if !mi.small.valid { mi.swapSmall() return } cmp := y.CompareKeys(mi.small.key, mi.bigger().key) switch { case cmp == 0: // Both the keys are equal. // In case of same keys, move the right iterator ahead. mi.right.next() if &mi.right == mi.small { mi.swapSmall() } return case cmp < 0: // Small is less than bigger(). if mi.reverse { mi.swapSmall() } else { // we don't need to do anything. Small already points to the smallest. } return default: // bigger() is less than small. if mi.reverse { // Do nothing since we're iterating in reverse. Small currently points to // the bigger key and that's okay in reverse iteration. } else { mi.swapSmall() } return } } func (mi *MergeIterator) bigger() *node { if mi.small == &mi.left { return &mi.right } return &mi.left } func (mi *MergeIterator) swapSmall() { if mi.small == &mi.left { mi.small = &mi.right return } if mi.small == &mi.right { mi.small = &mi.left return } } // Next returns the next element. If it is the same as the current key, ignore it. func (mi *MergeIterator) Next() { for mi.Valid() { if !bytes.Equal(mi.small.key, mi.curKey) { break } mi.small.next() mi.fix() } mi.setCurrent() } func (mi *MergeIterator) setCurrent() { mi.curKey = append(mi.curKey[:0], mi.small.key...) } // Rewind seeks to first element (or last element for reverse iterator). func (mi *MergeIterator) Rewind() { mi.left.rewind() mi.right.rewind() mi.fix() mi.setCurrent() } // Seek brings us to element with key >= given key. func (mi *MergeIterator) Seek(key []byte) { mi.left.seek(key) mi.right.seek(key) mi.fix() mi.setCurrent() } // Valid returns whether the MergeIterator is at a valid element. func (mi *MergeIterator) Valid() bool { return mi.small.valid } // Key returns the key associated with the current iterator. func (mi *MergeIterator) Key() []byte { return mi.small.key } // Value returns the value associated with the iterator. func (mi *MergeIterator) Value() y.ValueStruct { return mi.small.iter.Value() } // Close implements y.Iterator. func (mi *MergeIterator) Close() error { err1 := mi.left.iter.Close() err2 := mi.right.iter.Close() if err1 != nil { return errors.Wrap(err1, "MergeIterator") } return errors.Wrap(err2, "MergeIterator") } // NewMergeIterator creates a merge iterator. func NewMergeIterator(iters []y.Iterator, reverse bool) y.Iterator { switch len(iters) { case 0: return nil case 1: return iters[0] case 2: mi := &MergeIterator{ reverse: reverse, } mi.left.setIterator(iters[0]) mi.right.setIterator(iters[1]) // Assign left iterator randomly. This will be fixed when user calls rewind/seek. 
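// For more than two iterators, the recursion below this switch builds a
// balanced binary tree of two-way MergeIterators: e.g. four iterators become
// two leaf merges joined by a root merge, so each Next() costs O(log N)
// comparisons rather than O(N).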
mi.small = &mi.left return mi } mid := len(iters) / 2 return NewMergeIterator( []y.Iterator{ NewMergeIterator(iters[:mid], reverse), NewMergeIterator(iters[mid:], reverse), }, reverse) } badger-2.2007.2/table/merge_iterator_test.go000066400000000000000000000271231372173116500206700ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package table import ( "sort" "testing" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) type SimpleIterator struct { keys [][]byte vals [][]byte idx int reversed bool } var ( closeCount int ) func (s *SimpleIterator) Close() error { closeCount++; return nil } func (s *SimpleIterator) Next() { if !s.reversed { s.idx++ } else { s.idx-- } } func (s *SimpleIterator) Rewind() { if !s.reversed { s.idx = 0 } else { s.idx = len(s.keys) - 1 } } func (s *SimpleIterator) Seek(key []byte) { key = y.KeyWithTs(key, 0) if !s.reversed { s.idx = sort.Search(len(s.keys), func(i int) bool { return y.CompareKeys(s.keys[i], key) >= 0 }) } else { n := len(s.keys) s.idx = n - 1 - sort.Search(n, func(i int) bool { return y.CompareKeys(s.keys[n-1-i], key) <= 0 }) } } func (s *SimpleIterator) Key() []byte { return s.keys[s.idx] } func (s *SimpleIterator) Value() y.ValueStruct { return y.ValueStruct{ Value: s.vals[s.idx], UserMeta: 55, Meta: 0, } } func (s *SimpleIterator) Valid() bool { return s.idx >= 0 && s.idx < len(s.keys) } var _ y.Iterator = &SimpleIterator{} func newSimpleIterator(keys []string, vals []string, reversed bool) *SimpleIterator { k := make([][]byte, len(keys)) v := make([][]byte, len(vals)) y.AssertTrue(len(keys) == len(vals)) for i := 0; i < len(keys); i++ { k[i] = y.KeyWithTs([]byte(keys[i]), 0) v[i] = []byte(vals[i]) } return &SimpleIterator{ keys: k, vals: v, idx: -1, reversed: reversed, } } func getAll(it y.Iterator) ([]string, []string) { var keys, vals []string for ; it.Valid(); it.Next() { k := it.Key() keys = append(keys, string(y.ParseKey(k))) v := it.Value() vals = append(vals, string(v.Value)) } return keys, vals } func closeAndCheck(t *testing.T, it y.Iterator, expected int) { closeCount = 0 it.Close() require.EqualValues(t, expected, closeCount) } func TestSimpleIterator(t *testing.T) { keys := []string{"1", "2", "3"} vals := []string{"v1", "v2", "v3"} it := newSimpleIterator(keys, vals, false) it.Rewind() k, v := getAll(it) require.EqualValues(t, keys, k) require.EqualValues(t, vals, v) closeAndCheck(t, it, 1) } func reversed(a []string) []string { var out []string for i := len(a) - 1; i >= 0; i-- { out = append(out, a[i]) } return out } func TestMergeSingle(t *testing.T) { keys := []string{"1", "2", "3"} vals := []string{"v1", "v2", "v3"} it := newSimpleIterator(keys, vals, false) mergeIt := NewMergeIterator([]y.Iterator{it}, false) mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, keys, k) require.EqualValues(t, vals, v) closeAndCheck(t, mergeIt, 1) } func TestMergeSingleReversed(t *testing.T) { keys := []string{"1", "2", "3"} vals := 
[]string{"v1", "v2", "v3"} it := newSimpleIterator(keys, vals, true) mergeIt := NewMergeIterator([]y.Iterator{it}, true) mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, reversed(keys), k) require.EqualValues(t, reversed(vals), v) closeAndCheck(t, mergeIt, 1) } func TestMergeMore(t *testing.T) { it := newSimpleIterator([]string{"1", "3", "7"}, []string{"a1", "a3", "a7"}, false) it2 := newSimpleIterator([]string{"2", "3", "5"}, []string{"b2", "b3", "b5"}, false) it3 := newSimpleIterator([]string{"1"}, []string{"c1"}, false) it4 := newSimpleIterator([]string{"1", "7", "9"}, []string{"d1", "d7", "d9"}, false) t.Run("forward", func(t *testing.T) { expectedKeys := []string{"1", "2", "3", "5", "7", "9"} expectedVals := []string{"a1", "b2", "a3", "b5", "a7", "d9"} t.Run("no duplicates", func(t *testing.T) { mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, false) mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, expectedKeys, k) require.EqualValues(t, expectedVals, v) closeAndCheck(t, mergeIt, 4) }) t.Run("duplicates", func(t *testing.T) { it5 := newSimpleIterator( []string{"1", "1", "3", "7"}, []string{"a1", "a1-1", "a3", "a7"}, false) mergeIt := NewMergeIterator([]y.Iterator{it5, it2, it3, it4}, false) expectedKeys := []string{"1", "2", "3", "5", "7", "9"} expectedVals := []string{"a1", "b2", "a3", "b5", "a7", "d9"} mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, expectedKeys, k) require.EqualValues(t, expectedVals, v) closeAndCheck(t, mergeIt, 4) }) }) t.Run("reverse", func(t *testing.T) { it.reversed = true it2.reversed = true it3.reversed = true it4.reversed = true t.Run("no duplicates", func(t *testing.T) { mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, true) expectedKeys := []string{"9", "7", "5", "3", "2", "1"} expectedVals := []string{"d9", "a7", "b5", "a3", "b2", "a1"} mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, expectedKeys, k) require.EqualValues(t, expectedVals, v) closeAndCheck(t, mergeIt, 4) }) t.Run("duplicates", func(t *testing.T) { it5 := newSimpleIterator( []string{"1", "1", "3", "7"}, []string{"a1", "a1-1", "a3", "a7"}, true) mergeIt := NewMergeIterator([]y.Iterator{it5, it2, it3, it4}, true) expectedKeys := []string{"9", "7", "5", "3", "2", "1"} expectedVals := []string{"d9", "a7", "b5", "a3", "b2", "a1-1"} mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, expectedKeys, k) require.EqualValues(t, expectedVals, v) closeAndCheck(t, mergeIt, 4) }) }) } // Ensure MergeIterator satisfies the Iterator interface func TestMergeIteratorNested(t *testing.T) { keys := []string{"1", "2", "3"} vals := []string{"v1", "v2", "v3"} it := newSimpleIterator(keys, vals, false) mergeIt := NewMergeIterator([]y.Iterator{it}, false) mergeIt2 := NewMergeIterator([]y.Iterator{mergeIt}, false) mergeIt2.Rewind() k, v := getAll(mergeIt2) require.EqualValues(t, keys, k) require.EqualValues(t, vals, v) closeAndCheck(t, mergeIt2, 1) } func TestMergeIteratorSeek(t *testing.T) { it := newSimpleIterator([]string{"1", "3", "7"}, []string{"a1", "a3", "a7"}, false) it2 := newSimpleIterator([]string{"2", "3", "5"}, []string{"b2", "b3", "b5"}, false) it3 := newSimpleIterator([]string{"1"}, []string{"c1"}, false) it4 := newSimpleIterator([]string{"1", "7", "9"}, []string{"d1", "d7", "d9"}, false) mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, false) mergeIt.Seek([]byte("4")) k, v := getAll(mergeIt) require.EqualValues(t, []string{"5", "7", "9"}, k) require.EqualValues(t, []string{"b5", "a7", "d9"}, v) 
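// Seek("4") positions the merge iterator at "5" (contributed by it2), the
// smallest key >= "4" across all four source iterators; earlier keys are
// skipped entirely.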
closeAndCheck(t, mergeIt, 4) } func TestMergeIteratorSeekReversed(t *testing.T) { it := newSimpleIterator([]string{"1", "3", "7"}, []string{"a1", "a3", "a7"}, true) it2 := newSimpleIterator([]string{"2", "3", "5"}, []string{"b2", "b3", "b5"}, true) it3 := newSimpleIterator([]string{"1"}, []string{"c1"}, true) it4 := newSimpleIterator([]string{"1", "7", "9"}, []string{"d1", "d7", "d9"}, true) mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, true) mergeIt.Seek([]byte("5")) k, v := getAll(mergeIt) require.EqualValues(t, []string{"5", "3", "2", "1"}, k) require.EqualValues(t, []string{"b5", "a3", "b2", "a1"}, v) closeAndCheck(t, mergeIt, 4) } func TestMergeIteratorSeekInvalid(t *testing.T) { it := newSimpleIterator([]string{"1", "3", "7"}, []string{"a1", "a3", "a7"}, false) it2 := newSimpleIterator([]string{"2", "3", "5"}, []string{"b2", "b3", "b5"}, false) it3 := newSimpleIterator([]string{"1"}, []string{"c1"}, false) it4 := newSimpleIterator([]string{"1", "7", "9"}, []string{"d1", "d7", "d9"}, false) mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, false) mergeIt.Seek([]byte("f")) require.False(t, mergeIt.Valid()) closeAndCheck(t, mergeIt, 4) } func TestMergeIteratorSeekInvalidReversed(t *testing.T) { it := newSimpleIterator([]string{"1", "3", "7"}, []string{"a1", "a3", "a7"}, true) it2 := newSimpleIterator([]string{"2", "3", "5"}, []string{"b2", "b3", "b5"}, true) it3 := newSimpleIterator([]string{"1"}, []string{"c1"}, true) it4 := newSimpleIterator([]string{"1", "7", "9"}, []string{"d1", "d7", "d9"}, true) mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, true) mergeIt.Seek([]byte("0")) require.False(t, mergeIt.Valid()) closeAndCheck(t, mergeIt, 4) } func TestMergeIteratorDuplicate(t *testing.T) { it1 := newSimpleIterator([]string{"0", "1", "2"}, []string{"a0", "a1", "a2"}, false) it2 := newSimpleIterator([]string{"1", "3"}, []string{"b1", "b3"}, false) it3 := newSimpleIterator([]string{"0", "1", "2"}, []string{"c0", "c1", "c2"}, false) t.Run("forward", func(t *testing.T) { t.Run("only duplicates", func(t *testing.T) { it := NewMergeIterator([]y.Iterator{it1, it3}, false) expectedKeys := []string{"0", "1", "2"} expectedVals := []string{"a0", "a1", "a2"} it.Rewind() k, v := getAll(it) require.Equal(t, expectedKeys, k) require.Equal(t, expectedVals, v) }) t.Run("one", func(t *testing.T) { it := NewMergeIterator([]y.Iterator{it3, it2, it1}, false) expectedKeys := []string{"0", "1", "2", "3"} expectedVals := []string{"c0", "c1", "c2", "b3"} it.Rewind() k, v := getAll(it) require.Equal(t, expectedKeys, k) require.Equal(t, expectedVals, v) }) t.Run("two", func(t *testing.T) { it1 := newSimpleIterator([]string{"0", "1", "2"}, []string{"0", "1", "2"}, false) it2 := newSimpleIterator([]string{"1"}, []string{"1"}, false) it3 := newSimpleIterator([]string{"2"}, []string{"2"}, false) it := NewMergeIterator([]y.Iterator{it3, it2, it1}, false) var cnt int for it.Rewind(); it.Valid(); it.Next() { require.EqualValues(t, cnt+48, it.Key()[0]) cnt++ } require.Equal(t, 3, cnt) }) }) t.Run("reverse", func(t *testing.T) { it1.reversed = true it2.reversed = true it3.reversed = true it := NewMergeIterator([]y.Iterator{it3, it2, it1}, true) expectedKeys := []string{"3", "2", "1", "0"} expectedVals := []string{"b3", "c2", "c1", "c0"} it.Rewind() k, v := getAll(it) require.Equal(t, expectedKeys, k) require.Equal(t, expectedVals, v) }) } func TestMergeDuplicates(t *testing.T) { it := newSimpleIterator([]string{"1", "1", "1"}, []string{"a1", "a3", "a7"}, false) it2 := 
newSimpleIterator([]string{"1", "1", "1"}, []string{"b2", "b3", "b5"}, false) it3 := newSimpleIterator([]string{"1"}, []string{"c1"}, false) it4 := newSimpleIterator([]string{"1", "1", "2"}, []string{"d1", "d7", "d9"}, false) t.Run("forward", func(t *testing.T) { expectedKeys := []string{"1", "2"} expectedVals := []string{"a1", "d9"} mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, false) mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, expectedKeys, k) require.EqualValues(t, expectedVals, v) closeAndCheck(t, mergeIt, 4) }) t.Run("reverse", func(t *testing.T) { it.reversed = true it2.reversed = true it3.reversed = true it4.reversed = true expectedKeys := []string{"2", "1"} expectedVals := []string{"d9", "a7"} mergeIt := NewMergeIterator([]y.Iterator{it, it2, it3, it4}, true) mergeIt.Rewind() k, v := getAll(mergeIt) require.EqualValues(t, expectedKeys, k) require.EqualValues(t, expectedVals, v) closeAndCheck(t, mergeIt, 4) }) } badger-2.2007.2/table/table.go000066400000000000000000000501371372173116500157110ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package table import ( "crypto/aes" "encoding/binary" "fmt" "io" "math" "os" "path" "path/filepath" "strconv" "strings" "sync" "sync/atomic" "unsafe" "github.com/golang/protobuf/proto" "github.com/golang/snappy" "github.com/pkg/errors" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/dgraph-io/ristretto" "github.com/dgraph-io/ristretto/z" ) const fileSuffix = ".sst" const intSize = int(unsafe.Sizeof(int(0))) // 1 word = 8 bytes // sizeOfOffsetStruct is the size of pb.BlockOffset const sizeOfOffsetStruct int64 = 3*8 + // key array takes 3 words 1*8 + // offset and len takes 1 word 3*8 + // XXX_unrecognized array takes 3 words. 1*8 // so far 7 words, in order to round the slab we're adding one more word. // Options contains configurable options for Table/Builder. type Options struct { // Options for Opening/Building Table. // ChkMode is the checksum verification mode for Table. ChkMode options.ChecksumVerificationMode // LoadingMode is the mode to be used for loading Table. LoadingMode options.FileLoadingMode // Options for Table builder. // BloomFalsePositive is the false positive probability of bloom filter. BloomFalsePositive float64 // BlockSize is the size of each block inside SSTable in bytes. BlockSize int // DataKey is the key used to decrypt the encrypted text. DataKey *pb.DataKey // Compression indicates the compression algorithm used for block compression. Compression options.CompressionType BlockCache *ristretto.Cache IndexCache *ristretto.Cache // ZSTDCompressionLevel is the ZSTD compression level used for compressing blocks. ZSTDCompressionLevel int // When LoadBloomsOnOpen is set, bloom filters will be loaded while opening // the table. Otherwise, they will be loaded lazily when they're accessed. LoadBloomsOnOpen bool } // TableInterface is useful for testing.
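// A test double only needs the three methods below. A hypothetical stub
// (names invented here for illustration) could look like:
//
//	type fakeTable struct{ lo, hi []byte }
//
//	func (f fakeTable) Smallest() []byte        { return f.lo }
//	func (f fakeTable) Biggest() []byte         { return f.hi }
//	func (f fakeTable) DoesNotHave(uint64) bool { return false } // pretend every key may be present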
type TableInterface interface { Smallest() []byte Biggest() []byte DoesNotHave(hash uint64) bool } // Table represents a loaded table file with the info we have about it. type Table struct { sync.Mutex fd *os.File // Own fd. tableSize int // Initialized in OpenTable, using fd.Stat(). bfLock sync.Mutex blockOffset []*pb.BlockOffset ref int32 // For file garbage collection. Atomic. bf *z.Bloom // Nil if index cache is enabled. mmap []byte // Memory mapped. // The following are initialized once and const. smallest, biggest []byte // Smallest and largest keys (with timestamps). id uint64 // file id, part of filename Checksum []byte // Stores the total size of key-values stored in this table (including the size on vlog). estimatedSize uint64 indexStart int indexLen int IsInmemory bool // Set to true if the table is on level 0 and opened in memory. opt *Options noOfBlocks int // Total number of blocks. } // CompressionType returns the compression algorithm used for block compression. func (t *Table) CompressionType() options.CompressionType { return t.opt.Compression } // IncrRef increments the refcount (having to do with whether the file should be deleted) func (t *Table) IncrRef() { atomic.AddInt32(&t.ref, 1) } // DecrRef decrements the refcount and possibly deletes the table func (t *Table) DecrRef() error { newRef := atomic.AddInt32(&t.ref, -1) if newRef == 0 { // We can safely delete this file, because for all the current files, we always have // at least one reference pointing to them. // Unmapping first is necessary for the file to be deletable on Windows. if t.opt.LoadingMode == options.MemoryMap { if err := y.Munmap(t.mmap); err != nil { return err } t.mmap = nil } // fd can be nil if the table belongs to L0 and it is opened in memory. See // OpenInMemoryTable method. if t.fd == nil { return nil } if err := t.fd.Truncate(0); err != nil { // This is very important to let the FS know that the file is deleted. return err } filename := t.fd.Name() if err := t.fd.Close(); err != nil { return err } if err := os.Remove(filename); err != nil { return err } // Delete all blocks from the cache. for i := 0; i < t.noOfBlocks; i++ { t.opt.BlockCache.Del(t.blockCacheKey(i)) } // Delete bloom filter and indices from the cache. t.opt.IndexCache.Del(t.blockOffsetsCacheKey()) t.opt.IndexCache.Del(t.bfCacheKey()) } return nil } type block struct { offset int data []byte checksum []byte entriesIndexStart int // start index of entryOffsets list entryOffsets []uint32 chkLen int // checksum length } func (b *block) size() int64 { return int64(3*intSize /* Size of the offset, entriesIndexStart and chkLen */ + cap(b.data) + cap(b.checksum) + cap(b.entryOffsets)*4) } func (b block) verifyCheckSum() error { cs := &pb.Checksum{} if err := proto.Unmarshal(b.checksum, cs); err != nil { return y.Wrapf(err, "unable to unmarshal checksum for block") } return y.VerifyChecksum(b.data, cs) } // OpenTable assumes file has only one table and opens it. Takes ownership of fd upon function // entry. Returns a table with one reference count on it (decrementing which may delete the file! // -- consider t.Close() instead). The fd has to be writeable because we call Truncate on it before // deleting. Checksum for all blocks of table is verified based on value of chkMode. func OpenTable(fd *os.File, opts Options) (*Table, error) { fileInfo, err := fd.Stat() if err != nil { // It's OK to ignore fd.Close() errs in this function because we have only read // from the file.
_ = fd.Close() return nil, y.Wrap(err) } filename := fileInfo.Name() id, ok := ParseFileID(filename) if !ok { _ = fd.Close() return nil, errors.Errorf("Invalid filename: %s", filename) } t := &Table{ fd: fd, ref: 1, // Caller is given one reference. id: id, opt: &opts, IsInmemory: false, } t.tableSize = int(fileInfo.Size()) switch opts.LoadingMode { case options.LoadToRAM: if _, err := t.fd.Seek(0, io.SeekStart); err != nil { return nil, err } t.mmap = make([]byte, t.tableSize) n, err := t.fd.Read(t.mmap) if err != nil { // It's OK to ignore fd.Close() error because we have only read from the file. _ = t.fd.Close() return nil, y.Wrapf(err, "Failed to load file into RAM") } if n != t.tableSize { return nil, errors.Errorf("Failed to read all bytes from the file. "+ "Bytes in file: %d Bytes actually Read: %d", t.tableSize, n) } case options.MemoryMap: t.mmap, err = y.Mmap(fd, false, fileInfo.Size()) if err != nil { _ = fd.Close() return nil, y.Wrapf(err, "Unable to map file: %q", fileInfo.Name()) } case options.FileIO: t.mmap = nil default: panic(fmt.Sprintf("Invalid loading mode: %v", opts.LoadingMode)) } if err := t.initBiggestAndSmallest(); err != nil { return nil, errors.Wrapf(err, "failed to initialize table") } if opts.ChkMode == options.OnTableRead || opts.ChkMode == options.OnTableAndBlockRead { if err := t.VerifyChecksum(); err != nil { _ = fd.Close() return nil, errors.Wrapf(err, "failed to verify checksum") } } return t, nil } // OpenInMemoryTable is similar to OpenTable but it opens a new table from the provided data. // OpenInMemoryTable is used for L0 tables. func OpenInMemoryTable(data []byte, id uint64, opt *Options) (*Table, error) { opt.LoadingMode = options.LoadToRAM t := &Table{ ref: 1, // Caller is given one reference. opt: opt, mmap: data, tableSize: len(data), IsInmemory: true, id: id, // It is important that each table gets a unique ID. } if err := t.initBiggestAndSmallest(); err != nil { return nil, err } return t, nil } func (t *Table) initBiggestAndSmallest() error { var err error var ko *pb.BlockOffset if ko, err = t.initIndex(); err != nil { return errors.Wrapf(err, "failed to read index.") } t.smallest = ko.Key it2 := t.NewIterator(true) defer it2.Close() it2.Rewind() if !it2.Valid() { return errors.Wrapf(it2.err, "failed to initialize biggest for table %s", t.Filename()) } t.biggest = it2.Key() return nil } // Close closes the open table. (Releases resources back to the OS.) func (t *Table) Close() error { if t.opt.LoadingMode == options.MemoryMap { if err := y.Munmap(t.mmap); err != nil { return err } t.mmap = nil } if t.fd == nil { return nil } return t.fd.Close() } func (t *Table) read(off, sz int) ([]byte, error) { if len(t.mmap) > 0 { if len(t.mmap[off:]) < sz { return nil, y.ErrEOF } return t.mmap[off : off+sz], nil } res := make([]byte, sz) nbr, err := t.fd.ReadAt(res, int64(off)) y.NumReads.Add(1) y.NumBytesRead.Add(int64(nbr)) return res, err } func (t *Table) readNoFail(off, sz int) []byte { res, err := t.read(off, sz) y.Check(err) return res } // initIndex reads the index, populates the necessary table fields, and returns // the first block offset. func (t *Table) initIndex() (*pb.BlockOffset, error) { readPos := t.tableSize // Read checksum len from the last 4 bytes. readPos -= 4 buf := t.readNoFail(readPos, 4) checksumLen := int(y.BytesToU32(buf)) if checksumLen < 0 { return nil, errors.New("checksum length less than zero. Data corrupted") } // Read checksum.
expectedChk := &pb.Checksum{} readPos -= checksumLen buf = t.readNoFail(readPos, checksumLen) if err := proto.Unmarshal(buf, expectedChk); err != nil { return nil, err } // Read index size from the footer. readPos -= 4 buf = t.readNoFail(readPos, 4) t.indexLen = int(y.BytesToU32(buf)) // Read index. readPos -= t.indexLen t.indexStart = readPos data := t.readNoFail(readPos, t.indexLen) if err := y.VerifyChecksum(data, expectedChk); err != nil { return nil, y.Wrapf(err, "failed to verify checksum for table: %s", t.Filename()) } index, err := t.readTableIndex() if err != nil { return nil, err } t.estimatedSize = index.EstimatedSize t.noOfBlocks = len(index.Offsets) // No cache if t.opt.IndexCache == nil { if t.opt.LoadBloomsOnOpen { bf, err := z.JSONUnmarshal(index.BloomFilter) if err != nil { return nil, errors.Wrapf(err, "failed to unmarshal bloomfilter for table:%d", t.id) } // Keep blooms in memory. t.bfLock.Lock() t.bf = bf t.bfLock.Unlock() } // Keep block offsets in memory since there is no cache. t.blockOffset = index.Offsets } // We don't need to put anything in the indexCache here. Table.Open will // create an iterator and that iterator will push the indices in cache. return index.Offsets[0], nil } // blockOffsets returns block offsets of this table. func (t *Table) blockOffsets() []*pb.BlockOffset { if t.opt.IndexCache == nil { return t.blockOffset } if val, ok := t.opt.IndexCache.Get(t.blockOffsetsCacheKey()); ok && val != nil { return val.([]*pb.BlockOffset) } index, err := t.readTableIndex() y.Check(err) t.opt.IndexCache.Set( t.blockOffsetsCacheKey(), index.Offsets, calculateOffsetsSize(index.Offsets)) return index.Offsets } // calculateOffsetsSize returns the size of *pb.BlockOffset array func calculateOffsetsSize(offsets []*pb.BlockOffset) int64 { totalSize := sizeOfOffsetStruct * int64(len(offsets)) for _, ko := range offsets { // add key size. totalSize += int64(cap(ko.Key)) // add XXX_unrecognized size. totalSize += int64(cap(ko.XXX_unrecognized)) } // Add three words for array size. return totalSize + 3*8 } func (t *Table) block(idx int) (*block, error) { y.AssertTruef(idx >= 0, "idx=%d", idx) if idx >= t.noOfBlocks { return nil, errors.New("block out of index") } if t.opt.BlockCache != nil { key := t.blockCacheKey(idx) blk, ok := t.opt.BlockCache.Get(key) if ok && blk != nil { return blk.(*block), nil } } // Read the block index if it's nil ko := t.blockOffsets()[idx] blk := &block{ offset: int(ko.Offset), } var err error if blk.data, err = t.read(blk.offset, int(ko.Len)); err != nil { return nil, errors.Wrapf(err, "failed to read from file: %s at offset: %d, len: %d", t.fd.Name(), blk.offset, ko.Len) } if t.shouldDecrypt() { // Decrypt the block if it is encrypted. if blk.data, err = t.decrypt(blk.data); err != nil { return nil, err } } blk.data, err = t.decompressData(blk.data) if err != nil { return nil, errors.Wrapf(err, "failed to decode compressed data in file: %s at offset: %d, len: %d", t.fd.Name(), blk.offset, ko.Len) } // Read meta data related to block. readPos := len(blk.data) - 4 // First read checksum length. blk.chkLen = int(y.BytesToU32(blk.data[readPos : readPos+4])) // Checksum length greater than block size could happen if the table was compressed and // it was opened with an incorrect compression algorithm (or the data was corrupted). if blk.chkLen > len(blk.data) { return nil, errors.New("invalid checksum length. 
Either the data is " + "corrupted or the table options are incorrectly set") } // Read checksum and store it readPos -= blk.chkLen blk.checksum = blk.data[readPos : readPos+blk.chkLen] // Move back and read numEntries in the block. readPos -= 4 numEntries := int(y.BytesToU32(blk.data[readPos : readPos+4])) entriesIndexStart := readPos - (numEntries * 4) entriesIndexEnd := entriesIndexStart + numEntries*4 blk.entryOffsets = y.BytesToU32Slice(blk.data[entriesIndexStart:entriesIndexEnd]) blk.entriesIndexStart = entriesIndexStart // Drop checksum and checksum length. // The checksum is calculated for actual data + entry index + index length blk.data = blk.data[:readPos+4] // Verify the checksum only if the checksum verification mode is OnBlockRead or OnTableAndBlockRead. if t.opt.ChkMode == options.OnBlockRead || t.opt.ChkMode == options.OnTableAndBlockRead { if err = blk.verifyCheckSum(); err != nil { return nil, err } } if t.opt.BlockCache != nil { key := t.blockCacheKey(idx) t.opt.BlockCache.Set(key, blk, blk.size()) } return blk, nil } // bfCacheKey returns the cache key for bloom filter. Bloom filters are stored in index cache. func (t *Table) bfCacheKey() []byte { y.AssertTrue(t.id < math.MaxUint32) buf := make([]byte, 6) // Without the "bf" prefix, we will have conflict with the blockCacheKey. buf[0] = 'b' buf[1] = 'f' binary.BigEndian.PutUint32(buf[2:], uint32(t.id)) return buf } // blockCacheKey is used to store blocks in the block cache. func (t *Table) blockCacheKey(idx int) []byte { y.AssertTrue(t.id < math.MaxUint32) y.AssertTrue(uint32(idx) < math.MaxUint32) buf := make([]byte, 8) // Assume t.ID does not overflow uint32. binary.BigEndian.PutUint32(buf[:4], uint32(t.ID())) binary.BigEndian.PutUint32(buf[4:], uint32(idx)) return buf } // blockOffsetsCacheKey returns the cache key for block offsets. blockOffsets // are stored in the index cache. func (t *Table) blockOffsetsCacheKey() uint64 { return t.id } // EstimatedSize returns the total size of key-values stored in this table (including the // disk space occupied on the value log). func (t *Table) EstimatedSize() uint64 { return t.estimatedSize } // Size is its file size in bytes func (t *Table) Size() int64 { return int64(t.tableSize) } // Smallest is its smallest key, or nil if there are none func (t *Table) Smallest() []byte { return t.smallest } // Biggest is its biggest key, or nil if there are none func (t *Table) Biggest() []byte { return t.biggest } // Filename is NOT the file name. Just kidding, it is. func (t *Table) Filename() string { return t.fd.Name() } // ID is the table's ID number (used to make the file name). func (t *Table) ID() uint64 { return t.id } // DoesNotHave returns true if (but not "only if") the table does not have the key hash. // It does a bloom filter lookup. func (t *Table) DoesNotHave(hash uint64) bool { // Return fast if the cache is absent. if t.opt.IndexCache == nil { t.bfLock.Lock() if t.bf == nil { y.AssertTrue(!t.opt.LoadBloomsOnOpen) // Load bloomfilter into memory since the cache is absent. t.bf, _ = t.readBloomFilter() } t.bfLock.Unlock() return !t.bf.Has(hash) } // Check if the bloom filter exists in the cache. if bf, ok := t.opt.IndexCache.Get(t.bfCacheKey()); bf != nil && ok { return !bf.(*z.Bloom).Has(hash) } bf, sz := t.readBloomFilter() t.opt.IndexCache.Set(t.bfCacheKey(), bf, int64(sz)) return !bf.Has(hash) } // readBloomFilter reads the bloom filter from the SST and returns its length // along with the bloom filter.
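// Note the contract DoesNotHave relies on: bloom filters produce no false
// negatives, so bf.Has(hash) == false proves the key is absent, while true
// only means it may be present and the blocks still have to be searched.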
func (t *Table) readBloomFilter() (*z.Bloom, int) { // Read bloom filter from the SST. index, err := t.readTableIndex() y.Check(err) bf, err := z.JSONUnmarshal(index.BloomFilter) y.Check(err) return bf, len(index.BloomFilter) } // readTableIndex reads table index from the sst and returns its pb format. func (t *Table) readTableIndex() (*pb.TableIndex, error) { data := t.readNoFail(t.indexStart, t.indexLen) index := pb.TableIndex{} var err error // Decrypt the table index if it is encrypted. if t.shouldDecrypt() { if data, err = t.decrypt(data); err != nil { return nil, y.Wrapf(err, "Error while decrypting table index for the table %d in readTableIndex", t.id) } } y.Check(proto.Unmarshal(data, &index)) return &index, nil } // VerifyChecksum verifies checksum for all blocks of table. This function is called by // OpenTable() function. This function is also called inside levelsController.VerifyChecksum(). func (t *Table) VerifyChecksum() error { for i, os := range t.blockOffsets() { b, err := t.block(i) if err != nil { return y.Wrapf(err, "checksum validation failed for table: %s, block: %d, offset:%d", t.Filename(), i, os.Offset) } // If the mode is OnBlockRead or OnTableAndBlockRead, we don't need to verify the // checksum here; verification is done while reading each block. if !(t.opt.ChkMode == options.OnBlockRead || t.opt.ChkMode == options.OnTableAndBlockRead) { if err = b.verifyCheckSum(); err != nil { return y.Wrapf(err, "checksum validation failed for table: %s, block: %d, offset:%d", t.Filename(), i, os.Offset) } } } return nil } // shouldDecrypt tells whether to decrypt or not. We decrypt only if the datakey exists // for the table. func (t *Table) shouldDecrypt() bool { return t.opt.DataKey != nil } // KeyID returns data key id. func (t *Table) KeyID() uint64 { if t.opt.DataKey != nil { return t.opt.DataKey.KeyId } // By default it's 0, if it is plain text. return 0 } // decrypt decrypts the given data. It should be called only after checking shouldDecrypt. func (t *Table) decrypt(data []byte) ([]byte, error) { // Last BlockSize bytes of the data is the IV. iv := data[len(data)-aes.BlockSize:] // Rest all bytes are data. data = data[:len(data)-aes.BlockSize] return y.XORBlock(data, t.opt.DataKey.Data, iv) } // ParseFileID reads the file id out of a filename. func ParseFileID(name string) (uint64, bool) { name = path.Base(name) if !strings.HasSuffix(name, fileSuffix) { return 0, false } // suffix := name[len(fileSuffix):] name = strings.TrimSuffix(name, fileSuffix) id, err := strconv.Atoi(name) if err != nil { return 0, false } y.AssertTrue(id >= 0) return uint64(id), true } // IDToFilename does the inverse of ParseFileID func IDToFilename(id uint64) string { return fmt.Sprintf("%06d", id) + fileSuffix } // NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table // filepath. func NewFilename(id uint64, dir string) string { return filepath.Join(dir, IDToFilename(id)) } // decompressData decompresses the given data. func (t *Table) decompressData(data []byte) ([]byte, error) { switch t.opt.Compression { case options.None: return data, nil case options.Snappy: return snappy.Decode(nil, data) case options.ZSTD: return y.ZSTDDecompress(nil, data) } return nil, errors.New("Unsupported compression type") } badger-2.2007.2/table/table_test.go000066400000000000000000000623261372173116500167510ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc.
and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package table import ( "bytes" "crypto/sha256" "fmt" "hash/crc32" "math/rand" "os" "sort" "strings" "sync" "testing" "time" "github.com/cespare/xxhash" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/dgraph-io/ristretto" "github.com/stretchr/testify/require" ) const ( KB = 1024 MB = KB * 1024 ) func key(prefix string, i int) string { return prefix + fmt.Sprintf("%04d", i) } func getTestTableOptions() Options { return Options{ Compression: options.ZSTD, ZSTDCompressionLevel: 15, LoadingMode: options.LoadToRAM, BlockSize: 4 * 1024, BloomFalsePositive: 0.01, } } func buildTestTable(t *testing.T, prefix string, n int, opts Options) *os.File { if opts.BloomFalsePositive == 0 { opts.BloomFalsePositive = 0.01 } if opts.BlockSize == 0 { opts.BlockSize = 4 * 1024 } y.AssertTrue(n <= 10000) keyValues := make([][]string, n) for i := 0; i < n; i++ { k := key(prefix, i) v := fmt.Sprintf("%d", i) keyValues[i] = []string{k, v} } return buildTable(t, keyValues, opts) } // keyValues is n by 2 where n is number of pairs. func buildTable(t *testing.T, keyValues [][]string, opts Options) *os.File { b := NewTableBuilder(opts) defer b.Close() // TODO: Add test for file garbage collection here. No files should be left after the tests here. 
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32()) f, err := y.CreateSyncedFile(filename, true) require.NoError(t, err) sort.Slice(keyValues, func(i, j int) bool { return keyValues[i][0] < keyValues[j][0] }) for _, kv := range keyValues { y.AssertTrue(len(kv) == 2) b.Add(y.KeyWithTs([]byte(kv[0]), 0), y.ValueStruct{Value: []byte(kv[1]), Meta: 'A', UserMeta: 0}, 0) } _, err = f.Write(b.Finish()) require.NoError(t, err, "writing to file failed") f.Close() f, _ = y.OpenSyncedFile(filename, true) return f } func TestTableIterator(t *testing.T) { for _, n := range []int{99, 100, 101} { t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", n, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() it := table.NewIterator(false) defer it.Close() count := 0 for it.Rewind(); it.Valid(); it.Next() { v := it.Value() k := y.KeyWithTs([]byte(key("key", count)), 0) require.EqualValues(t, k, it.Key()) require.EqualValues(t, fmt.Sprintf("%d", count), string(v.Value)) count++ } require.Equal(t, count, n) }) } } func TestSeekToFirst(t *testing.T) { for _, n := range []int{99, 100, 101, 199, 200, 250, 9999, 10000} { t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", n, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() it := table.NewIterator(false) defer it.Close() it.seekToFirst() require.True(t, it.Valid()) v := it.Value() require.EqualValues(t, "0", string(v.Value)) require.EqualValues(t, 'A', v.Meta) }) } } func TestSeekToLast(t *testing.T) { for _, n := range []int{99, 100, 101, 199, 200, 250, 9999, 10000} { t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", n, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() it := table.NewIterator(false) defer it.Close() it.seekToLast() require.True(t, it.Valid()) v := it.Value() require.EqualValues(t, fmt.Sprintf("%d", n-1), string(v.Value)) require.EqualValues(t, 'A', v.Meta) it.prev() require.True(t, it.Valid()) v = it.Value() require.EqualValues(t, fmt.Sprintf("%d", n-2), string(v.Value)) require.EqualValues(t, 'A', v.Meta) }) } } func TestSeek(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "k", 10000, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() it := table.NewIterator(false) defer it.Close() var data = []struct { in string valid bool out string }{ {"abc", true, "k0000"}, {"k0100", true, "k0100"}, {"k0100b", true, "k0101"}, // Test case where we jump to next block. {"k1234", true, "k1234"}, {"k1234b", true, "k1235"}, {"k9999", true, "k9999"}, {"z", false, ""}, } for _, tt := range data { it.seek(y.KeyWithTs([]byte(tt.in), 0)) if !tt.valid { require.False(t, it.Valid()) continue } require.True(t, it.Valid()) k := it.Key() require.EqualValues(t, tt.out, string(y.ParseKey(k))) } } func TestSeekForPrev(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "k", 10000, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() it := table.NewIterator(false) defer it.Close() var data = []struct { in string valid bool out string }{ {"abc", false, ""}, {"k0100", true, "k0100"}, {"k0100b", true, "k0100"}, // Test case where we jump to next block. 
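// (Note: seekForPrev positions the iterator at the largest key <= the target,
// which is why "k0100b" resolves backwards to "k0100" here, whereas seek in
// TestSeek above resolves the same input forwards to the next key "k0101".)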
{"k1234", true, "k1234"}, {"k1234b", true, "k1234"}, {"k9999", true, "k9999"}, {"z", true, "k9999"}, } for _, tt := range data { it.seekForPrev(y.KeyWithTs([]byte(tt.in), 0)) if !tt.valid { require.False(t, it.Valid()) continue } require.True(t, it.Valid()) k := it.Key() require.EqualValues(t, tt.out, string(y.ParseKey(k))) } } func TestIterateFromStart(t *testing.T) { // Vary the number of elements added. for _, n := range []int{99, 100, 101, 199, 200, 250, 9999, 10000} { t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", n, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() ti := table.NewIterator(false) defer ti.Close() ti.reset() ti.seekToFirst() require.True(t, ti.Valid()) // No need to do a Next. // ti.Seek brings us to the first key >= "". Essentially a SeekToFirst. var count int for ; ti.Valid(); ti.next() { v := ti.Value() require.EqualValues(t, fmt.Sprintf("%d", count), string(v.Value)) require.EqualValues(t, 'A', v.Meta) count++ } require.EqualValues(t, n, count) }) } } func TestIterateFromEnd(t *testing.T) { // Vary the number of elements added. for _, n := range []int{99, 100, 101, 199, 200, 250, 9999, 10000} { t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", n, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() ti := table.NewIterator(false) defer ti.Close() ti.reset() ti.seek(y.KeyWithTs([]byte("zzzzzz"), 0)) // Seek to end, an invalid element. require.False(t, ti.Valid()) for i := n - 1; i >= 0; i-- { ti.prev() require.True(t, ti.Valid()) v := ti.Value() require.EqualValues(t, fmt.Sprintf("%d", i), string(v.Value)) require.EqualValues(t, 'A', v.Meta) } ti.prev() require.False(t, ti.Valid()) }) } } func TestTable(t *testing.T) { opts := getTestTableOptions() opts.LoadingMode = options.FileIO f := buildTestTable(t, "key", 10000, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() ti := table.NewIterator(false) defer ti.Close() kid := 1010 seek := y.KeyWithTs([]byte(key("key", kid)), 0) for ti.seek(seek); ti.Valid(); ti.next() { k := ti.Key() require.EqualValues(t, string(y.ParseKey(k)), key("key", kid)) kid++ } if kid != 10000 { t.Errorf("Expected kid: 10000. 
Got: %v", kid) } ti.seek(y.KeyWithTs([]byte(key("key", 99999)), 0)) require.False(t, ti.Valid()) ti.seek(y.KeyWithTs([]byte(key("key", -1)), 0)) require.True(t, ti.Valid()) k := ti.Key() require.EqualValues(t, string(y.ParseKey(k)), key("key", 0)) } func TestIterateBackAndForth(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", 10000, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() seek := y.KeyWithTs([]byte(key("key", 1010)), 0) it := table.NewIterator(false) defer it.Close() it.seek(seek) require.True(t, it.Valid()) k := it.Key() require.EqualValues(t, seek, k) it.prev() it.prev() require.True(t, it.Valid()) k = it.Key() require.EqualValues(t, key("key", 1008), string(y.ParseKey(k))) it.next() it.next() require.True(t, it.Valid()) k = it.Key() require.EqualValues(t, key("key", 1010), y.ParseKey(k)) it.seek(y.KeyWithTs([]byte(key("key", 2000)), 0)) require.True(t, it.Valid()) k = it.Key() require.EqualValues(t, key("key", 2000), y.ParseKey(k)) it.prev() require.True(t, it.Valid()) k = it.Key() require.EqualValues(t, key("key", 1999), y.ParseKey(k)) it.seekToFirst() k = it.Key() require.EqualValues(t, key("key", 0), y.ParseKey(k)) } func TestUniIterator(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", 10000, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() { it := table.NewIterator(false) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { v := it.Value() require.EqualValues(t, fmt.Sprintf("%d", count), string(v.Value)) require.EqualValues(t, 'A', v.Meta) count++ } require.EqualValues(t, 10000, count) } { it := table.NewIterator(true) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { v := it.Value() require.EqualValues(t, fmt.Sprintf("%d", 10000-1-count), string(v.Value)) require.EqualValues(t, 'A', v.Meta) count++ } require.EqualValues(t, 10000, count) } } // Try having only one table. 
func TestConcatIteratorOneTable(t *testing.T) { opts := getTestTableOptions() f := buildTable(t, [][]string{ {"k1", "a1"}, {"k2", "a2"}, }, opts) tbl, err := OpenTable(f, opts) require.NoError(t, err) defer tbl.DecrRef() it := NewConcatIterator([]*Table{tbl}, false) defer it.Close() it.Rewind() require.True(t, it.Valid()) k := it.Key() require.EqualValues(t, "k1", string(y.ParseKey(k))) vs := it.Value() require.EqualValues(t, "a1", string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) } func TestConcatIterator(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "keya", 10000, opts) f2 := buildTestTable(t, "keyb", 10000, opts) f3 := buildTestTable(t, "keyc", 10000, opts) tbl, err := OpenTable(f, opts) require.NoError(t, err) defer tbl.DecrRef() tbl2, err := OpenTable(f2, opts) require.NoError(t, err) defer tbl2.DecrRef() tbl3, err := OpenTable(f3, opts) require.NoError(t, err) defer tbl3.DecrRef() { it := NewConcatIterator([]*Table{tbl, tbl2, tbl3}, false) defer it.Close() it.Rewind() require.True(t, it.Valid()) var count int for ; it.Valid(); it.Next() { vs := it.Value() require.EqualValues(t, fmt.Sprintf("%d", count%10000), string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) count++ } require.EqualValues(t, 30000, count) it.Seek(y.KeyWithTs([]byte("a"), 0)) require.EqualValues(t, "keya0000", string(y.ParseKey(it.Key()))) vs := it.Value() require.EqualValues(t, "0", string(vs.Value)) it.Seek(y.KeyWithTs([]byte("keyb"), 0)) require.EqualValues(t, "keyb0000", string(y.ParseKey(it.Key()))) vs = it.Value() require.EqualValues(t, "0", string(vs.Value)) it.Seek(y.KeyWithTs([]byte("keyb9999b"), 0)) require.EqualValues(t, "keyc0000", string(y.ParseKey(it.Key()))) vs = it.Value() require.EqualValues(t, "0", string(vs.Value)) it.Seek(y.KeyWithTs([]byte("keyd"), 0)) require.False(t, it.Valid()) } { it := NewConcatIterator([]*Table{tbl, tbl2, tbl3}, true) defer it.Close() it.Rewind() require.True(t, it.Valid()) var count int for ; it.Valid(); it.Next() { vs := it.Value() require.EqualValues(t, fmt.Sprintf("%d", 10000-(count%10000)-1), string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) count++ } require.EqualValues(t, 30000, count) it.Seek(y.KeyWithTs([]byte("a"), 0)) require.False(t, it.Valid()) it.Seek(y.KeyWithTs([]byte("keyb"), 0)) require.EqualValues(t, "keya9999", string(y.ParseKey(it.Key()))) vs := it.Value() require.EqualValues(t, "9999", string(vs.Value)) it.Seek(y.KeyWithTs([]byte("keyb9999b"), 0)) require.EqualValues(t, "keyb9999", string(y.ParseKey(it.Key()))) vs = it.Value() require.EqualValues(t, "9999", string(vs.Value)) it.Seek(y.KeyWithTs([]byte("keyd"), 0)) require.EqualValues(t, "keyc9999", string(y.ParseKey(it.Key()))) vs = it.Value() require.EqualValues(t, "9999", string(vs.Value)) } } func TestMergingIterator(t *testing.T) { opts := getTestTableOptions() f1 := buildTable(t, [][]string{ {"k1", "a1"}, {"k4", "a4"}, {"k5", "a5"}, }, opts) f2 := buildTable(t, [][]string{ {"k2", "b2"}, {"k3", "b3"}, {"k4", "b4"}, }, opts) expected := []struct { key string value string }{ {"k1", "a1"}, {"k2", "b2"}, {"k3", "b3"}, {"k4", "a4"}, {"k5", "a5"}, } tbl1, err := OpenTable(f1, opts) require.NoError(t, err) defer tbl1.DecrRef() tbl2, err := OpenTable(f2, opts) require.NoError(t, err) defer tbl2.DecrRef() it1 := tbl1.NewIterator(false) it2 := NewConcatIterator([]*Table{tbl2}, false) it := NewMergeIterator([]y.Iterator{it1, it2}, false) defer it.Close() var i int for it.Rewind(); it.Valid(); it.Next() { k := it.Key() vs := it.Value() require.EqualValues(t, expected[i].key, 
string(y.ParseKey(k))) require.EqualValues(t, expected[i].value, string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) i++ } require.Equal(t, i, len(expected)) require.False(t, it.Valid()) } func TestMergingIteratorReversed(t *testing.T) { opts := getTestTableOptions() f1 := buildTable(t, [][]string{ {"k1", "a1"}, {"k2", "a2"}, {"k4", "a4"}, {"k5", "a5"}, }, opts) f2 := buildTable(t, [][]string{ {"k1", "b2"}, {"k3", "b3"}, {"k4", "b4"}, {"k5", "b5"}, }, opts) expected := []struct { key string value string }{ {"k5", "a5"}, {"k4", "a4"}, {"k3", "b3"}, {"k2", "a2"}, {"k1", "a1"}, } tbl1, err := OpenTable(f1, opts) require.NoError(t, err) defer tbl1.DecrRef() tbl2, err := OpenTable(f2, opts) require.NoError(t, err) defer tbl2.DecrRef() it1 := tbl1.NewIterator(true) it2 := NewConcatIterator([]*Table{tbl2}, true) it := NewMergeIterator([]y.Iterator{it1, it2}, true) defer it.Close() var i int for it.Rewind(); it.Valid(); it.Next() { k := it.Key() vs := it.Value() require.EqualValues(t, expected[i].key, string(y.ParseKey(k))) require.EqualValues(t, expected[i].value, string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) i++ } require.Equal(t, i, len(expected)) require.False(t, it.Valid()) } // Take only the first iterator. func TestMergingIteratorTakeOne(t *testing.T) { opts := getTestTableOptions() f1 := buildTable(t, [][]string{ {"k1", "a1"}, {"k2", "a2"}, }, opts) f2 := buildTable(t, [][]string{{"l1", "b1"}}, opts) t1, err := OpenTable(f1, opts) require.NoError(t, err) defer t1.DecrRef() t2, err := OpenTable(f2, opts) require.NoError(t, err) defer t2.DecrRef() it1 := NewConcatIterator([]*Table{t1}, false) it2 := NewConcatIterator([]*Table{t2}, false) it := NewMergeIterator([]y.Iterator{it1, it2}, false) defer it.Close() it.Rewind() require.True(t, it.Valid()) k := it.Key() require.EqualValues(t, "k1", string(y.ParseKey(k))) vs := it.Value() require.EqualValues(t, "a1", string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) it.Next() require.True(t, it.Valid()) k = it.Key() require.EqualValues(t, "k2", string(y.ParseKey(k))) vs = it.Value() require.EqualValues(t, "a2", string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) it.Next() k = it.Key() require.EqualValues(t, "l1", string(y.ParseKey(k))) vs = it.Value() require.EqualValues(t, "b1", string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) it.Next() require.False(t, it.Valid()) } // Take only the second iterator. 
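// (Note on the merging tests above: when both source iterators hold the same
// key, such as k4, NewMergeIterator yields the element from the iterator that
// appears first in the slice. A sketch of that precedence, with hypothetical
// iterators `newer` and `older`:
//
//	it := NewMergeIterator([]y.Iterator{newer, older}, false)
//	// duplicate keys resolve in favor of `newer`
// )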
func TestMergingIteratorTakeTwo(t *testing.T) { opts := getTestTableOptions() f1 := buildTable(t, [][]string{{"l1", "b1"}}, opts) f2 := buildTable(t, [][]string{ {"k1", "a1"}, {"k2", "a2"}, }, opts) t1, err := OpenTable(f1, opts) require.NoError(t, err) defer t1.DecrRef() t2, err := OpenTable(f2, opts) require.NoError(t, err) defer t2.DecrRef() it1 := NewConcatIterator([]*Table{t1}, false) it2 := NewConcatIterator([]*Table{t2}, false) it := NewMergeIterator([]y.Iterator{it1, it2}, false) defer it.Close() it.Rewind() require.True(t, it.Valid()) k := it.Key() require.EqualValues(t, "k1", string(y.ParseKey(k))) vs := it.Value() require.EqualValues(t, "a1", string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) it.Next() require.True(t, it.Valid()) k = it.Key() require.EqualValues(t, "k2", string(y.ParseKey(k))) vs = it.Value() require.EqualValues(t, "a2", string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) it.Next() require.True(t, it.Valid()) k = it.Key() require.EqualValues(t, "l1", string(y.ParseKey(k))) vs = it.Value() require.EqualValues(t, "b1", string(vs.Value)) require.EqualValues(t, 'A', vs.Meta) it.Next() require.False(t, it.Valid()) } func TestTableBigValues(t *testing.T) { value := func(i int) []byte { return []byte(fmt.Sprintf("%01048576d", i)) // Return 1MB value which is > math.MaxUint16. } rand.Seed(time.Now().UnixNano()) filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) f, err := y.OpenSyncedFile(filename, true) require.NoError(t, err, "unable to create file") n := 100 // Insert 100 keys. opts := Options{Compression: options.ZSTD, BlockSize: 4 * 1024, BloomFalsePositive: 0.01} builder := NewTableBuilder(opts) for i := 0; i < n; i++ { key := y.KeyWithTs([]byte(key("", i)), 0) vs := y.ValueStruct{Value: value(i)} builder.Add(key, vs, 0) } _, err = f.Write(builder.Finish()) require.NoError(t, err, "unable to write to file") tbl, err := OpenTable(f, opts) require.NoError(t, err, "unable to open table") defer tbl.DecrRef() itr := tbl.NewIterator(false) require.True(t, itr.Valid()) count := 0 for itr.Rewind(); itr.Valid(); itr.Next() { require.Equal(t, []byte(key("", count)), y.ParseKey(itr.Key()), "keys are not equal") require.Equal(t, value(count), itr.Value().Value, "values are not equal") count++ } require.False(t, itr.Valid(), "table iterator should be invalid now") require.Equal(t, n, count) } // This test is for verifying checksum failure during table open. func TestTableChecksum(t *testing.T) { rand.Seed(time.Now().Unix()) // we are going to write random byte at random location in table file. rb := make([]byte, 100) rand.Read(rb) opts := getTestTableOptions() opts.ChkMode = options.OnTableAndBlockRead f := buildTestTable(t, "k", 10000, opts) fi, err := f.Stat() require.NoError(t, err, "unable to get file information") // Write random bytes at random location. n, err := f.WriteAt(rb, rand.Int63n(fi.Size())) require.NoError(t, err) require.Equal(t, n, len(rb)) _, err = OpenTable(f, opts) if err == nil || !strings.Contains(err.Error(), "checksum") { t.Fatal("Test should have been failed with checksum mismatch error") } } var cacheConfig = ristretto.Config{ NumCounters: 1000000 * 10, MaxCost: 1000000, BufferItems: 64, Metrics: true, } func BenchmarkRead(b *testing.B) { n := int(5 * 1e6) tbl := getTableForBenchmarks(b, n, nil) defer tbl.DecrRef() b.ResetTimer() // Iterate b.N times over the entire table. 
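// (To run just these table benchmarks, the standard Go tooling invocation is
// assumed to be something like:
//
//	go test -run=NONE -bench=BenchmarkRead -benchmem ./table/...
// )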
for i := 0; i < b.N; i++ { func() { it := tbl.NewIterator(false) defer it.Close() for it.seekToFirst(); it.Valid(); it.next() { } }() } } func BenchmarkReadAndBuild(b *testing.B) { n := int(5 * 1e6) var cache, _ = ristretto.NewCache(&cacheConfig) tbl := getTableForBenchmarks(b, n, cache) defer tbl.DecrRef() b.ResetTimer() // Iterate b.N times over the entire table. for i := 0; i < b.N; i++ { func() { opts := Options{Compression: options.ZSTD, BlockSize: 4 * 1024, BloomFalsePositive: 0.01} opts.BlockCache = cache newBuilder := NewTableBuilder(opts) it := tbl.NewIterator(false) defer it.Close() for it.seekToFirst(); it.Valid(); it.next() { vs := it.Value() newBuilder.Add(it.Key(), vs, 0) } newBuilder.Finish() }() } } func BenchmarkReadMerged(b *testing.B) { n := int(5 * 1e6) m := 5 // Number of tables. y.AssertTrue((n % m) == 0) tableSize := n / m var tables []*Table var cache, err = ristretto.NewCache(&cacheConfig) require.NoError(b, err) for i := 0; i < m; i++ { filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) opts := Options{Compression: options.ZSTD, BlockSize: 4 * 1024, BloomFalsePositive: 0.01} opts.BlockCache = cache builder := NewTableBuilder(opts) f, err := y.OpenSyncedFile(filename, true) y.Check(err) for j := 0; j < tableSize; j++ { id := j*m + i // Arrays are interleaved. // id := i*tableSize+j (not interleaved) k := fmt.Sprintf("%016x", id) v := fmt.Sprintf("%d", id) builder.Add([]byte(k), y.ValueStruct{Value: []byte(v), Meta: 123, UserMeta: 0}, 0) } _, err = f.Write(builder.Finish()) require.NoError(b, err, "unable to write to file") tbl, err := OpenTable(f, opts) y.Check(err) tables = append(tables, tbl) defer tbl.DecrRef() } b.ResetTimer() // Iterate b.N times over all the tables, merged. for i := 0; i < b.N; i++ { func() { var iters []y.Iterator for _, tbl := range tables { iters = append(iters, tbl.NewIterator(false)) } it := NewMergeIterator(iters, false) defer it.Close() for it.Rewind(); it.Valid(); it.Next() { } }() } } func BenchmarkChecksum(b *testing.B) { keySz := []int{KB, 2 * KB, 4 * KB, 8 * KB, 16 * KB, 32 * KB, 64 * KB, 128 * KB, 256 * KB, MB} for _, kz := range keySz { key := make([]byte, kz) b.Run(fmt.Sprintf("CRC %d", kz), func(b *testing.B) { for i := 0; i < b.N; i++ { crc32.ChecksumIEEE(key) } }) b.Run(fmt.Sprintf("xxHash64 %d", kz), func(b *testing.B) { for i := 0; i < b.N; i++ { xxhash.Sum64(key) } }) b.Run(fmt.Sprintf("SHA256 %d", kz), func(b *testing.B) { for i := 0; i < b.N; i++ { sha256.Sum256(key) } }) } } func BenchmarkRandomRead(b *testing.B) { n := int(5 * 1e6) tbl := getTableForBenchmarks(b, n, nil) defer tbl.DecrRef() r := rand.New(rand.NewSource(time.Now().Unix())) b.ResetTimer() for i := 0; i < b.N; i++ { itr := tbl.NewIterator(false) no := r.Intn(n) k := []byte(fmt.Sprintf("%016x", no)) v := []byte(fmt.Sprintf("%d", no)) itr.Seek(k) if !itr.Valid() { b.Fatal("itr should be valid") } v1 := itr.Value().Value if !bytes.Equal(v, v1) { fmt.Println("value does not match") b.Fatal() } itr.Close() } } func getTableForBenchmarks(b *testing.B, count int, cache *ristretto.Cache) *Table { rand.Seed(time.Now().Unix()) opts := Options{Compression: options.ZSTD, BlockSize: 4 * 1024, BloomFalsePositive: 0.01} if cache == nil { var err error cache, err = ristretto.NewCache(&cacheConfig) require.NoError(b, err) } opts.BlockCache = cache builder := NewTableBuilder(opts) filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) f, err := y.OpenSyncedFile(filename, true) require.NoError(b, err)
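// Build `count` sequential key-value pairs. Keys are zero-padded hex
// ("%016x"), so lexicographic order matches numeric order and random
// lookups can seek by formatted key.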
for i := 0; i < count; i++ { k := fmt.Sprintf("%016x", i) v := fmt.Sprintf("%d", i) builder.Add([]byte(k), y.ValueStruct{Value: []byte(v)}, 0) } _, err = f.Write(builder.Finish()) require.NoError(b, err, "unable to write to file") tbl, err := OpenTable(f, opts) require.NoError(b, err, "unable to open table") return tbl } func TestMain(m *testing.M) { rand.Seed(time.Now().UTC().UnixNano()) os.Exit(m.Run()) } func TestOpenKVSize(t *testing.T) { opts := getTestTableOptions() table, err := OpenTable(buildTestTable(t, "foo", 1, opts), opts) require.NoError(t, err) // The following values might change if the table/header structure is changed. var entrySize uint64 = 15 /* DiffKey len */ + 4 /* Header Size */ + 4 /* Encoded vp */ require.Equal(t, entrySize, table.EstimatedSize()) } // Run this test with command "go test -race -run TestDoesNotHaveRace" func TestDoesNotHaveRace(t *testing.T) { opts := getTestTableOptions() f := buildTestTable(t, "key", 10000, opts) table, err := OpenTable(f, opts) require.NoError(t, err) defer table.DecrRef() var wg sync.WaitGroup wg.Add(5) for i := 0; i < 5; i++ { go func() { require.True(t, table.DoesNotHave(uint64(1237882))) wg.Done() }() } wg.Wait() } var ko *pb.BlockOffset // Use this benchmark to manually verify block offset size calculation func BenchmarkBlockOffsetSizeCalculation(b *testing.B) { for i := 0; i < b.N; i++ { ko = &pb.BlockOffset{ Key: []byte{1, 23}, } } } func TestBlockOffsetSizeCalculation(t *testing.T) { // Empty struct testing. require.Equal(t, calculateOffsetsSize([]*pb.BlockOffset{&pb.BlockOffset{}}), int64(88)) // Testing with key bytes require.Equal(t, calculateOffsetsSize([]*pb.BlockOffset{&pb.BlockOffset{Key: []byte{1, 1}}}), int64(90)) } badger-2.2007.2/test.sh000077500000000000000000000024121372173116500145130ustar00rootroot00000000000000#!/bin/bash set -e go version packages=$(go list ./... | grep github.com/dgraph-io/badger/v2/) if [[ ! -z "$TEAMCITY_VERSION" ]]; then export GOFLAGS="-json" fi # Ensure that we can compile the binary. pushd badger go build -v . popd # Run the memory intensive tests first. go test -v -run='TestBigKeyValuePairs$' --manual=true go test -v -run='TestPushValueLogLimit' --manual=true # Run the special Truncate test. rm -rf p go test -v -run='TestTruncateVlogNoClose$' --manual=true truncate --size=4096 p/000000.vlog go test -v -run='TestTruncateVlogNoClose2$' --manual=true go test -v -run='TestTruncateVlogNoClose3$' --manual=true rm -rf p # Then the normal tests. echo echo "==> Starting test for table, skl and y package" go test -v -race github.com/dgraph-io/badger/v2/skl # Run test for all package except the top level package. The top level package support the # `vlog_mmap` flag which rest of the packages don't support. go test -v -race $packages echo echo "==> Starting tests with value log mmapped..." # Run top level package tests with mmap flag. go test -timeout=25m -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=true echo echo "==> Starting tests with value log not mmapped..." go test -timeout=25m -v -race github.com/dgraph-io/badger/v2 --vlog_mmap=false badger-2.2007.2/trie/000077500000000000000000000000001372173116500141415ustar00rootroot00000000000000badger-2.2007.2/trie/trie.go000066400000000000000000000045521372173116500154410ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package trie type node struct { children map[byte]*node ids []uint64 } func newNode() *node { return &node{ children: make(map[byte]*node), ids: []uint64{}, } } // Trie datastructure. type Trie struct { root *node } // NewTrie returns Trie. func NewTrie() *Trie { return &Trie{ root: newNode(), } } // Add adds the id in the trie for the given prefix path. func (t *Trie) Add(prefix []byte, id uint64) { node := t.root for _, val := range prefix { child, ok := node.children[val] if !ok { child = newNode() node.children[val] = child } node = child } // We only need to add the id to the last node of the given prefix. node.ids = append(node.ids, id) } // Get returns prefix matched ids for the given key. func (t *Trie) Get(key []byte) map[uint64]struct{} { out := make(map[uint64]struct{}) node := t.root // If root has ids that means we have subscribers for "nil/[]byte{}" // prefix. Add them to the list. if len(node.ids) > 0 { for _, i := range node.ids { out[i] = struct{}{} } } for _, val := range key { child, ok := node.children[val] if !ok { break } // We need ids of the all the node in the matching key path. for _, id := range child.ids { out[id] = struct{}{} } node = child } return out } // Delete will delete the id if the id exist in the given index path. func (t *Trie) Delete(index []byte, id uint64) { node := t.root for _, val := range index { child, ok := node.children[val] if !ok { return } node = child } // We're just removing the id not the hanging path. out := node.ids[:0] for _, val := range node.ids { if val != id { out = append(out, val) } } for i := len(out); i < len(node.ids); i++ { node.ids[i] = 0 // garbage collecting } node.ids = out } badger-2.2007.2/trie/trie_test.go000066400000000000000000000035611372173116500164770ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package trie import ( "testing" "github.com/stretchr/testify/require" ) func TestGet(t *testing.T) { trie := NewTrie() trie.Add([]byte("hello"), 1) trie.Add([]byte("hello"), 3) trie.Add([]byte("hello"), 4) trie.Add([]byte("hel"), 20) trie.Add([]byte("he"), 20) trie.Add([]byte("badger"), 30) trie.Add(nil, 10) require.Equal(t, map[uint64]struct{}{10: {}}, trie.Get([]byte("A"))) ids := trie.Get([]byte("hel")) require.Equal(t, 2, len(ids)) require.Equal(t, map[uint64]struct{}{10: {}, 20: {}}, ids) ids = trie.Get([]byte("badger")) require.Equal(t, 2, len(ids)) require.Equal(t, map[uint64]struct{}{10: {}, 30: {}}, ids) ids = trie.Get([]byte("hello")) require.Equal(t, 5, len(ids)) require.Equal(t, map[uint64]struct{}{10: {}, 1: {}, 3: {}, 4: {}, 20: {}}, ids) trie.Add([]byte{}, 11) require.Equal(t, map[uint64]struct{}{10: {}, 11: {}}, trie.Get([]byte("A"))) } func TestTrieDelete(t *testing.T) { trie := NewTrie() trie.Add([]byte("hello"), 1) trie.Add([]byte("hello"), 3) trie.Add([]byte("hello"), 4) trie.Add(nil, 5) trie.Delete([]byte("hello"), 4) require.Equal(t, map[uint64]struct{}{5: {}, 1: {}, 3: {}}, trie.Get([]byte("hello"))) trie.Delete(nil, 5) require.Equal(t, map[uint64]struct{}{1: {}, 3: {}}, trie.Get([]byte("hello"))) } badger-2.2007.2/txn.go000066400000000000000000000564771372173116500143610ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "context" "encoding/hex" "math" "sort" "strconv" "sync" "sync/atomic" "github.com/dgraph-io/badger/v2/y" "github.com/dgraph-io/ristretto/z" "github.com/pkg/errors" ) type oracle struct { isManaged bool // Does not change value, so no locking required. detectConflicts bool // Determines if the txns should be checked for conflicts. sync.Mutex // For nextTxnTs and commits. // writeChLock lock is for ensuring that transactions go to the write // channel in the same order as their commit timestamps. writeChLock sync.Mutex nextTxnTs uint64 // Used to block NewTransaction, so all previous commits are visible to a new read. txnMark *y.WaterMark // Either of these is used to determine which versions can be permanently // discarded during compaction. discardTs uint64 // Used by ManagedDB. readMark *y.WaterMark // Used by DB. // committedTxns contains all committed writes (contains fingerprints // of keys written and their latest commit counter). committedTxns []committedTxn lastCleanupTs uint64 // closer is used to stop watermarks. closer *y.Closer } type committedTxn struct { ts uint64 // ConflictKeys Keeps track of the entries written at timestamp ts. conflictKeys map[uint64]struct{} } func newOracle(opt Options) *oracle { orc := &oracle{ isManaged: opt.managedTxns, detectConflicts: opt.DetectConflicts, // We're not initializing nextTxnTs and readOnlyTs. It would be done after replay in Open. // // WaterMarks must be 64-bit aligned for atomic package, hence we must use pointers here. // See https://golang.org/pkg/sync/atomic/#pkg-note-BUG. 
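// (Illustration of the alignment pitfall, with a hypothetical struct: on
// 32-bit platforms a uint64 accessed atomically must be 64-bit aligned, and
// embedding a WaterMark by value after a smaller field could misalign its
// internal counters:
//
//	type bad struct {
//		closed bool
//		mark   y.WaterMark // by value: atomic ops on its fields may fault on 386/ARM
//	}
//
// Pointers avoid this because allocations start 64-bit aligned.)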
readMark: &y.WaterMark{Name: "badger.PendingReads"}, txnMark: &y.WaterMark{Name: "badger.TxnTimestamp"}, closer: y.NewCloser(2), } orc.readMark.Init(orc.closer) orc.txnMark.Init(orc.closer) return orc } func (o *oracle) Stop() { o.closer.SignalAndWait() } func (o *oracle) readTs() uint64 { if o.isManaged { panic("ReadTs should not be retrieved for managed DB") } var readTs uint64 o.Lock() readTs = o.nextTxnTs - 1 o.readMark.Begin(readTs) o.Unlock() // Wait for all txns which have no conflicts, have been assigned a commit // timestamp and are going through the write to value log and LSM tree // process. Not waiting here could mean that some txns which have been // committed would not be read. y.Check(o.txnMark.WaitForMark(context.Background(), readTs)) return readTs } func (o *oracle) nextTs() uint64 { o.Lock() defer o.Unlock() return o.nextTxnTs } func (o *oracle) incrementNextTs() { o.Lock() defer o.Unlock() o.nextTxnTs++ } // Any deleted or invalid versions at or below ts would be discarded during // compaction to reclaim disk space in LSM tree and thence value log. func (o *oracle) setDiscardTs(ts uint64) { o.Lock() defer o.Unlock() o.discardTs = ts o.cleanupCommittedTransactions() } func (o *oracle) discardAtOrBelow() uint64 { if o.isManaged { o.Lock() defer o.Unlock() return o.discardTs } return o.readMark.DoneUntil() } // hasConflict must be called while having a lock. func (o *oracle) hasConflict(txn *Txn) bool { if len(txn.reads) == 0 { return false } for _, committedTxn := range o.committedTxns { // If the committedTxn.ts is less than txn.readTs that implies that the // committedTxn finished before the current transaction started. // We don't need to check for conflict in that case. // This change assumes linearizability. Lack of linearizability could // cause the read ts of a new txn to be lower than the commit ts of // a txn before it (@mrjn). if committedTxn.ts <= txn.readTs { continue } for _, ro := range txn.reads { if _, has := committedTxn.conflictKeys[ro]; has { return true } } } return false } func (o *oracle) newCommitTs(txn *Txn) uint64 { o.Lock() defer o.Unlock() if o.hasConflict(txn) { return 0 } var ts uint64 if !o.isManaged { o.doneRead(txn) o.cleanupCommittedTransactions() // This is the general case, when user doesn't specify the read and commit ts. ts = o.nextTxnTs o.nextTxnTs++ o.txnMark.Begin(ts) } else { // If commitTs is set, use it instead. ts = txn.commitTs } y.AssertTrue(ts >= o.lastCleanupTs) if o.detectConflicts { // We should ensure that txns are not added to o.committedTxns slice when // conflict detection is disabled otherwise this slice would keep growing. o.committedTxns = append(o.committedTxns, committedTxn{ ts: ts, conflictKeys: txn.conflictKeys, }) } return ts } func (o *oracle) doneRead(txn *Txn) { if !txn.doneRead { txn.doneRead = true o.readMark.Done(txn.readTs) } } func (o *oracle) cleanupCommittedTransactions() { // Must be called under o.Lock if !o.detectConflicts { // When detectConflicts is set to false, we do not store any // committedTxns and so there's nothing to clean up. 
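// (Worked example with hypothetical timestamps: if committedTxns holds
// entries at ts 3, 5 and 7 while the oldest read still in flight pins
// maxReadTs at 5, cleanup keeps only the ts 7 entry. Entries at or below
// maxReadTs can no longer conflict with any future transaction, because
// every new read timestamp will be >= maxReadTs and hasConflict skips
// committed txns with ts <= readTs.)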
return } // Same logic as discardAtOrBelow but unlocked var maxReadTs uint64 if o.isManaged { maxReadTs = o.discardTs } else { maxReadTs = o.readMark.DoneUntil() } y.AssertTrue(maxReadTs >= o.lastCleanupTs) // do not run clean up if the maxReadTs (read timestamp of the // oldest transaction that is still in flight) has not increased if maxReadTs == o.lastCleanupTs { return } o.lastCleanupTs = maxReadTs tmp := o.committedTxns[:0] for _, txn := range o.committedTxns { if txn.ts <= maxReadTs { continue } tmp = append(tmp, txn) } o.committedTxns = tmp } func (o *oracle) doneCommit(cts uint64) { if o.isManaged { // No need to update anything. return } o.txnMark.Done(cts) } // Txn represents a Badger transaction. type Txn struct { readTs uint64 commitTs uint64 update bool // update is used to conditionally keep track of reads. reads []uint64 // contains fingerprints of keys read. // contains fingerprints of keys written. This is used for conflict detection. conflictKeys map[uint64]struct{} readsLock sync.Mutex // guards the reads slice. See addReadKey. pendingWrites map[string]*Entry // cache stores any writes done by txn. duplicateWrites []*Entry // Used in managed mode to store duplicate entries. db *DB discarded bool doneRead bool size int64 count int64 numIterators int32 } type pendingWritesIterator struct { entries []*Entry nextIdx int readTs uint64 reversed bool } func (pi *pendingWritesIterator) Next() { pi.nextIdx++ } func (pi *pendingWritesIterator) Rewind() { pi.nextIdx = 0 } func (pi *pendingWritesIterator) Seek(key []byte) { key = y.ParseKey(key) pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool { cmp := bytes.Compare(pi.entries[idx].Key, key) if !pi.reversed { return cmp >= 0 } return cmp <= 0 }) } func (pi *pendingWritesIterator) Key() []byte { y.AssertTrue(pi.Valid()) entry := pi.entries[pi.nextIdx] return y.KeyWithTs(entry.Key, pi.readTs) } func (pi *pendingWritesIterator) Value() y.ValueStruct { y.AssertTrue(pi.Valid()) entry := pi.entries[pi.nextIdx] return y.ValueStruct{ Value: entry.Value, Meta: entry.meta, UserMeta: entry.UserMeta, ExpiresAt: entry.ExpiresAt, Version: pi.readTs, } } func (pi *pendingWritesIterator) Valid() bool { return pi.nextIdx < len(pi.entries) } func (pi *pendingWritesIterator) Close() error { return nil } func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator { if !txn.update || len(txn.pendingWrites) == 0 { return nil } entries := make([]*Entry, 0, len(txn.pendingWrites)) for _, e := range txn.pendingWrites { entries = append(entries, e) } // Number of pending writes per transaction shouldn't be too big in general. sort.Slice(entries, func(i, j int) bool { cmp := bytes.Compare(entries[i].Key, entries[j].Key) if !reversed { return cmp < 0 } return cmp > 0 }) return &pendingWritesIterator{ readTs: txn.readTs, entries: entries, reversed: reversed, } } func (txn *Txn) checkSize(e *Entry) error { count := txn.count + 1 // Extra bytes for the version in key. size := txn.size + int64(e.estimateSize(txn.db.opt.ValueThreshold)) + 10 if count >= txn.db.opt.maxBatchCount || size >= txn.db.opt.maxBatchSize { return ErrTxnTooBig } txn.count, txn.size = count, size return nil } func exceedsSize(prefix string, max int64, key []byte) error { return errors.Errorf("%s with size %d exceeded %d limit. 
%s:\n%s", prefix, len(key), max, prefix, hex.Dump(key[:1<<10])) } func (txn *Txn) modify(e *Entry) error { const maxKeySize = 65000 switch { case !txn.update: return ErrReadOnlyTxn case txn.discarded: return ErrDiscardedTxn case len(e.Key) == 0: return ErrEmptyKey case bytes.HasPrefix(e.Key, badgerPrefix): return ErrInvalidKey case len(e.Key) > maxKeySize: // Key length can't be more than uint16, as determined by table::header. To // keep things safe and allow badger move prefix and a timestamp suffix, let's // cut it down to 65000, instead of using 65536. return exceedsSize("Key", maxKeySize, e.Key) case int64(len(e.Value)) > txn.db.opt.ValueLogFileSize: return exceedsSize("Value", txn.db.opt.ValueLogFileSize, e.Value) case txn.db.opt.InMemory && len(e.Value) > txn.db.opt.ValueThreshold: return exceedsSize("Value", int64(txn.db.opt.ValueThreshold), e.Value) } if err := txn.checkSize(e); err != nil { return err } // The txn.conflictKeys is used for conflict detection. If conflict detection // is disabled, we don't need to store key hashes in this map. if txn.db.opt.DetectConflicts { fp := z.MemHash(e.Key) // Avoid dealing with byte arrays. txn.conflictKeys[fp] = struct{}{} } // If a duplicate entry was inserted in managed mode, move it to the duplicate writes slice. // Add the entry to duplicateWrites only if both the entries have different versions. For // same versions, we will overwrite the existing entry. if oldEntry, ok := txn.pendingWrites[string(e.Key)]; ok && oldEntry.version != e.version { txn.duplicateWrites = append(txn.duplicateWrites, oldEntry) } txn.pendingWrites[string(e.Key)] = e return nil } // Set adds a key-value pair to the database. // It will return ErrReadOnlyTxn if update flag was set to false when creating the transaction. // // The current transaction keeps a reference to the key and val byte slice // arguments. Users must not modify key and val until the end of the transaction. func (txn *Txn) Set(key, val []byte) error { return txn.SetEntry(NewEntry(key, val)) } // SetEntry takes an Entry struct and adds the key-value pair in the struct, // along with other metadata to the database. // // The current transaction keeps a reference to the entry passed in argument. // Users must not modify the entry until the end of the transaction. func (txn *Txn) SetEntry(e *Entry) error { return txn.modify(e) } // Delete deletes a key. // // This is done by adding a delete marker for the key at commit timestamp. Any // reads happening before this timestamp would be unaffected. Any reads after // this commit would see the deletion. // // The current transaction keeps a reference to the key byte slice argument. // Users must not modify the key until the end of the transaction. func (txn *Txn) Delete(key []byte) error { e := &Entry{ Key: key, meta: bitDelete, } return txn.modify(e) } // Get looks for key and returns corresponding Item. // If key is not found, ErrKeyNotFound is returned. func (txn *Txn) Get(key []byte) (item *Item, rerr error) { if len(key) == 0 { return nil, ErrEmptyKey } else if txn.discarded { return nil, ErrDiscardedTxn } item = new(Item) if txn.update { if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key) { if isDeletedOrExpired(e.meta, e.ExpiresAt) { return nil, ErrKeyNotFound } // Fulfill from cache. item.meta = e.meta item.val = e.Value item.userMeta = e.UserMeta item.key = key item.status = prefetched item.version = txn.readTs item.expiresAt = e.ExpiresAt // We probably don't need to set db on item here. 
return item, nil } // Only track reads if this is update txn. No need to track read if txn serviced it // internally. txn.addReadKey(key) } seek := y.KeyWithTs(key, txn.readTs) vs, err := txn.db.get(seek) if err != nil { return nil, errors.Wrapf(err, "DB::Get key: %q", key) } if vs.Value == nil && vs.Meta == 0 { return nil, ErrKeyNotFound } if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { return nil, ErrKeyNotFound } item.key = key item.version = vs.Version item.meta = vs.Meta item.userMeta = vs.UserMeta item.db = txn.db item.vptr = y.SafeCopy(item.vptr, vs.Value) item.txn = txn item.expiresAt = vs.ExpiresAt return item, nil } func (txn *Txn) addReadKey(key []byte) { if txn.update { fp := z.MemHash(key) // Because of the possibility of multiple iterators it is now possible // for multiple threads within a read-write transaction to read keys at // the same time. The reads slice is not currently thread-safe and // needs to be locked whenever we mark a key as read. txn.readsLock.Lock() txn.reads = append(txn.reads, fp) txn.readsLock.Unlock() } } // Discard discards a created transaction. This method is very important and must be called. Commit // method calls this internally, however, calling this multiple times doesn't cause any issues. So, // this can safely be called via a defer right when transaction is created. // // NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned. func (txn *Txn) Discard() { if txn.discarded { // Avoid a re-run. return } if atomic.LoadInt32(&txn.numIterators) > 0 { panic("Unclosed iterator at time of Txn.Discard.") } txn.discarded = true if !txn.db.orc.isManaged { txn.db.orc.doneRead(txn) } } func (txn *Txn) commitAndSend() (func() error, error) { orc := txn.db.orc // Ensure that the order in which we get the commit timestamp is the same as // the order in which we push these updates to the write channel. So, we // acquire a writeChLock before getting a commit timestamp, and only release // it after pushing the entries to it. orc.writeChLock.Lock() defer orc.writeChLock.Unlock() commitTs := orc.newCommitTs(txn) // The commitTs can be zero if the transaction is running in managed mode. // Individual entries might have their own timestamps. if commitTs == 0 && !txn.db.opt.managedTxns { return nil, ErrConflict } keepTogether := true setVersion := func(e *Entry) { if e.version == 0 { e.version = commitTs } else { keepTogether = false } } for _, e := range txn.pendingWrites { setVersion(e) } // The duplicateWrites slice will be non-empty only if there are duplicate // entries with different versions. for _, e := range txn.duplicateWrites { setVersion(e) } entries := make([]*Entry, 0, len(txn.pendingWrites)+len(txn.duplicateWrites)+1) processEntry := func(e *Entry) { // Suffix the keys with commit ts, so the key versions are sorted in // descending order of commit timestamp. e.Key = y.KeyWithTs(e.Key, e.version) // Add bitTxn only if these entries are part of a transaction. We // support SetEntryAt(..) in managed mode which means a single // transaction can have entries with different timestamps. If entries // in a single transaction have different timestamps, we don't add the // transaction markers. if keepTogether { e.meta |= bitTxn } entries = append(entries, e) } // The following debug information is what led to determining the cause of // bank txn violation bug, and it took a whole bunch of effort to narrow it // down to here. So, keep this around for at least a couple of months. // var b strings.Builder // fmt.Fprintf(&b, "Read: %d. 
Commit: %d. reads: %v. writes: %v. Keys: ", // txn.readTs, commitTs, txn.reads, txn.conflictKeys) for _, e := range txn.pendingWrites { processEntry(e) } for _, e := range txn.duplicateWrites { processEntry(e) } if keepTogether { // CommitTs should not be zero if we're inserting transaction markers. y.AssertTrue(commitTs != 0) e := &Entry{ Key: y.KeyWithTs(txnKey, commitTs), Value: []byte(strconv.FormatUint(commitTs, 10)), meta: bitFinTxn, } entries = append(entries, e) } req, err := txn.db.sendToWriteCh(entries) if err != nil { orc.doneCommit(commitTs) return nil, err } ret := func() error { err := req.Wait() // Wait before marking commitTs as done. // We can't defer doneCommit above, because it is being called from a // callback here. orc.doneCommit(commitTs) return err } return ret, nil } func (txn *Txn) commitPrecheck() error { if txn.discarded { return errors.New("Trying to commit a discarded txn") } keepTogether := true for _, e := range txn.pendingWrites { if e.version != 0 { keepTogether = false } } // If keepTogether is true, it implies transaction markers will be added. // In that case, commitTs should never be zero. This might happen if // someone uses txn.Commit instead of txn.CommitAt in managed mode. This // can happen only in managed mode. In normal mode, keepTogether will // always be true. if keepTogether && txn.db.opt.managedTxns && txn.commitTs == 0 { return errors.New("CommitTs cannot be zero. Please use commitAt instead") } return nil } // Commit commits the transaction, following these steps: // // 1. If there are no writes, return immediately. // // 2. Check if read rows were updated since txn started. If so, return ErrConflict. // // 3. If no conflict, generate a commit timestamp and update written rows' commit ts. // // 4. Batch up all writes, write them to value log and LSM tree. // // 5. If callback is provided, Badger will return immediately after checking // for conflicts. Writes to the database will happen in the background. If // there is a conflict, an error will be returned and the callback will not // run. If there are no conflicts, the callback will be called in the // background upon successful completion of writes or any error during write. // // If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM // tree won't be updated, so there's no need for any rollback. func (txn *Txn) Commit() error { // txn.conflictKeys can be zero if conflict detection is turned off. So we // should check txn.pendingWrites. if len(txn.pendingWrites) == 0 { return nil // Nothing to do. } // Precheck before discarding txn. if err := txn.commitPrecheck(); err != nil { return err } defer txn.Discard() txnCb, err := txn.commitAndSend() if err != nil { return err } // If batchSet failed, LSM would not have been updated. So, no need to rollback anything. // TODO: What if some of the txns successfully make it to value log, but others fail. // Nothing gets updated to LSM, until a restart happens. return txnCb() } type txnCb struct { commit func() error user func(error) err error } func runTxnCallback(cb *txnCb) { switch { case cb == nil: panic("txn callback is nil") case cb.user == nil: panic("Must have caught a nil callback for txn.CommitWith") case cb.err != nil: cb.user(cb.err) case cb.commit != nil: err := cb.commit() cb.user(err) default: cb.user(nil) } } // CommitWith acts like Commit, but takes a callback, which gets run via a // goroutine to avoid blocking this function.
The callback is guaranteed to run, // so it is safe to increment sync.WaitGroup before calling CommitWith, and // decrementing it in the callback; to block until all callbacks are run. func (txn *Txn) CommitWith(cb func(error)) { if cb == nil { panic("Nil callback provided to CommitWith") } if len(txn.pendingWrites) == 0 { // Do not run these callbacks from here, because the CommitWith and the // callback might be acquiring the same locks. Instead run the callback // from another goroutine. go runTxnCallback(&txnCb{user: cb, err: nil}) return } // Precheck before discarding txn. if err := txn.commitPrecheck(); err != nil { cb(err) return } defer txn.Discard() commitCb, err := txn.commitAndSend() if err != nil { go runTxnCallback(&txnCb{user: cb, err: err}) return } go runTxnCallback(&txnCb{user: cb, commit: commitCb}) } // ReadTs returns the read timestamp of the transaction. func (txn *Txn) ReadTs() uint64 { return txn.readTs } // NewTransaction creates a new transaction. Badger supports concurrent execution of transactions, // providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking // the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by // another transaction. // // For read-only transactions, set update to false. In this mode, we don't track the rows read for // any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead. // // Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and // should only be run serially. It doesn't matter if a transaction is created by one goroutine and // passed down to other, as long as the Txn APIs are called serially. // // When you create a new transaction, it is absolutely essential to call // Discard(). This should be done irrespective of what the update param is set // to. Commit API internally runs Discard, but running it twice wouldn't cause // any issues. // // txn := db.NewTransaction(false) // defer txn.Discard() // // Call various APIs. func (db *DB) NewTransaction(update bool) *Txn { return db.newTransaction(update, false) } func (db *DB) newTransaction(update, isManaged bool) *Txn { if db.opt.ReadOnly && update { // DB is read-only, force read-only transaction. update = false } txn := &Txn{ update: update, db: db, count: 1, // One extra entry for BitFin. size: int64(len(txnKey) + 10), // Some buffer for the extra entry. } if update { if db.opt.DetectConflicts { txn.conflictKeys = make(map[uint64]struct{}) } txn.pendingWrites = make(map[string]*Entry) } if !isManaged { txn.readTs = db.orc.readTs() } return txn } // View executes a function creating and managing a read-only transaction for the user. Error // returned by the function is relayed by the View method. // If View is used with managed transactions, it would assume a read timestamp of MaxUint64. func (db *DB) View(fn func(txn *Txn) error) error { if db.IsClosed() { return ErrDBClosed } var txn *Txn if db.opt.managedTxns { txn = db.NewTransactionAt(math.MaxUint64, false) } else { txn = db.NewTransaction(false) } defer txn.Discard() return fn(txn) } // Update executes a function, creating and managing a read-write transaction // for the user. Error returned by the function is relayed by the Update method. // Update cannot be used with managed transactions. 
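//
// Typical usage (a sketch mirroring the NewTransaction example above):
//
//	err := db.Update(func(txn *Txn) error {
//		return txn.Set([]byte("answer"), []byte("42"))
//	})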
func (db *DB) Update(fn func(txn *Txn) error) error { if db.IsClosed() { return ErrDBClosed } if db.opt.managedTxns { panic("Update can only be used with managedDB=false.") } txn := db.NewTransaction(true) defer txn.Discard() if err := fn(txn); err != nil { return err } return txn.Commit() } badger-2.2007.2/txn_test.go000066400000000000000000000600521372173116500154000ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "fmt" "io/ioutil" "math/rand" "strconv" "sync" "sync/atomic" "testing" "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/y" "github.com/stretchr/testify/require" ) func TestTxnSimple(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { txn := db.NewTransaction(true) for i := 0; i < 10; i++ { k := []byte(fmt.Sprintf("key=%d", i)) v := []byte(fmt.Sprintf("val=%d", i)) require.NoError(t, txn.SetEntry(NewEntry(k, v))) } item, err := txn.Get([]byte("key=8")) require.NoError(t, err) require.NoError(t, item.Value(func(val []byte) error { require.Equal(t, []byte("val=8"), val) return nil })) require.Panics(t, func() { txn.CommitAt(100, nil) }) require.NoError(t, txn.Commit()) }) } func TestTxnReadAfterWrite(t *testing.T) { test := func(t *testing.T, db *DB) { var wg sync.WaitGroup N := 100 wg.Add(N) for i := 0; i < N; i++ { go func(i int) { defer wg.Done() key := []byte(fmt.Sprintf("key%d", i)) err := db.Update(func(tx *Txn) error { return tx.SetEntry(NewEntry(key, key)) }) require.NoError(t, err) err = db.View(func(tx *Txn) error { item, err := tx.Get(key) require.NoError(t, err) val, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val, key) return nil }) require.NoError(t, err) }(i) } wg.Wait() } t.Run("disk mode", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { test(t, db) }) }) t.Run("InMemory mode", func(t *testing.T) { opt := getTestOptions("") opt.InMemory = true db, err := Open(opt) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) } func TestTxnCommitAsync(t *testing.T) { key := func(i int) []byte { return []byte(fmt.Sprintf("key=%d", i)) } runBadgerTest(t, nil, func(t *testing.T, db *DB) { txn := db.NewTransaction(true) for i := 0; i < 40; i++ { err := txn.SetEntry(NewEntry(key(i), []byte(strconv.Itoa(100)))) require.NoError(t, err) } require.NoError(t, txn.Commit()) txn.Discard() closer := y.NewCloser(1) go func() { defer closer.Done() for { select { case <-closer.HasBeenClosed(): return default: } // Keep checking balance variant txn := db.NewTransaction(false) totalBalance := 0 for i := 0; i < 40; i++ { item, err := txn.Get(key(i)) require.NoError(t, err) val, err := item.ValueCopy(nil) require.NoError(t, err) bal, err := strconv.Atoi(string(val)) require.NoError(t, err) totalBalance += bal } require.Equal(t, totalBalance, 4000) txn.Discard() } }() var wg sync.WaitGroup wg.Add(100) for i := 0; i < 100; i++ { go func() { txn := db.NewTransaction(true) delta := rand.Intn(100) 
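// The two loops below preserve the checker goroutine's invariant
// (total balance == 4000): keys 0..19 are set to 100-delta and keys
// 20..39 to 100+delta, so every commit redistributes balance between
// the two halves without changing the sum.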
for i := 0; i < 20; i++ { err := txn.SetEntry(NewEntry(key(i), []byte(strconv.Itoa(100-delta)))) require.NoError(t, err) } for i := 20; i < 40; i++ { err := txn.SetEntry(NewEntry(key(i), []byte(strconv.Itoa(100+delta)))) require.NoError(t, err) } // We are only doing writes, so there won't be any conflicts. txn.CommitWith(func(err error) {}) txn.Discard() wg.Done() }() } wg.Wait() closer.SignalAndWait() time.Sleep(time.Millisecond * 10) // allow goroutine to complete. }) } func TestTxnVersions(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { k := []byte("key") for i := 1; i < 10; i++ { txn := db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(k, []byte(fmt.Sprintf("valversion=%d", i))))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(i), db.orc.readTs()) } checkIterator := func(itr *Iterator, i int) { defer itr.Close() count := 0 for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() require.Equal(t, k, item.Key()) val, err := item.ValueCopy(nil) require.NoError(t, err) exp := fmt.Sprintf("valversion=%d", i) require.Equal(t, exp, string(val), "i=%d", i) count++ } require.Equal(t, 1, count, "i=%d", i) // Should only loop once. } checkAllVersions := func(itr *Iterator, i int) { var version uint64 if itr.opt.Reverse { version = 1 } else { version = uint64(i) } count := 0 for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() require.Equal(t, k, item.Key()) require.Equal(t, version, item.Version()) val, err := item.ValueCopy(nil) require.NoError(t, err) exp := fmt.Sprintf("valversion=%d", version) require.Equal(t, exp, string(val), "v=%d", version) count++ if itr.opt.Reverse { version++ } else { version-- } } require.Equal(t, i, count, "i=%d", i) // Should loop as many times as i. } for i := 1; i < 10; i++ { txn := db.NewTransaction(true) txn.readTs = uint64(i) // Read version at i. item, err := txn.Get(k) require.NoError(t, err) val, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, []byte(fmt.Sprintf("valversion=%d", i)), val, "Expected versions to match up at i=%d", i) // Try retrieving the latest version forward and reverse. itr := txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, i) opt := DefaultIteratorOptions opt.Reverse = true itr = txn.NewIterator(opt) checkIterator(itr, i) // Now try retrieving all versions forward and reverse. opt = DefaultIteratorOptions opt.AllVersions = true itr = txn.NewIterator(opt) checkAllVersions(itr, i) itr.Close() opt = DefaultIteratorOptions opt.AllVersions = true opt.Reverse = true itr = txn.NewIterator(opt) checkAllVersions(itr, i) itr.Close() txn.Discard() } txn := db.NewTransaction(true) defer txn.Discard() item, err := txn.Get(k) require.NoError(t, err) val, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, []byte("valversion=9"), val) }) } func TestTxnWriteSkew(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // Accounts ax := []byte("x") ay := []byte("y") // Set balance to $100 in each account. 
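// (Background: write skew is the anomaly where two transactions each read
// both accounts and then update different ones. Plain snapshot isolation
// would let both commit, breaking the invariant x+y == 200. Badger tracks
// the read set of update transactions, so the second Commit below is
// expected to fail with a conflict.)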
txn := db.NewTransaction(true) defer txn.Discard() val := []byte(strconv.Itoa(100)) require.NoError(t, txn.SetEntry(NewEntry(ax, val))) require.NoError(t, txn.SetEntry(NewEntry(ay, val))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(1), db.orc.readTs()) getBal := func(txn *Txn, key []byte) (bal int) { item, err := txn.Get(key) require.NoError(t, err) val, err := item.ValueCopy(nil) require.NoError(t, err) bal, err = strconv.Atoi(string(val)) require.NoError(t, err) return bal } // Start two transactions, each would read both accounts and deduct from one account. txn1 := db.NewTransaction(true) sum := getBal(txn1, ax) sum += getBal(txn1, ay) require.Equal(t, 200, sum) require.NoError(t, txn1.SetEntry(NewEntry(ax, []byte("0")))) // Deduct 100 from ax. // Let's read this back. sum = getBal(txn1, ax) require.Equal(t, 0, sum) sum += getBal(txn1, ay) require.Equal(t, 100, sum) // Don't commit yet. txn2 := db.NewTransaction(true) sum = getBal(txn2, ax) sum += getBal(txn2, ay) require.Equal(t, 200, sum) require.NoError(t, txn2.SetEntry(NewEntry(ay, []byte("0")))) // Deduct 100 from ay. // Let's read this back. sum = getBal(txn2, ax) require.Equal(t, 100, sum) sum += getBal(txn2, ay) require.Equal(t, 100, sum) // Commit both now. require.NoError(t, txn1.Commit()) require.Error(t, txn2.Commit()) // This should fail. require.Equal(t, uint64(2), db.orc.readTs()) }) } // a3, a2, b4 (del), b3, c2, c1 // Read at ts=4 -> a3, c2 // Read at ts=4(Uncommitted) -> a3, b4 // Read at ts=3 -> a3, b3, c2 // Read at ts=2 -> a2, c2 // Read at ts=1 -> c1 func TestTxnIterationEdgeCase(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { ka := []byte("a") kb := []byte("b") kc := []byte("c") // c1 txn := db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(kc, []byte("c1")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(1), db.orc.readTs()) // a2, c2 txn = db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(ka, []byte("a2")))) require.NoError(t, txn.SetEntry(NewEntry(kc, []byte("c2")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(2), db.orc.readTs()) // b3 txn = db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(ka, []byte("a3")))) require.NoError(t, txn.SetEntry(NewEntry(kb, []byte("b3")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(3), db.orc.readTs()) // b4, c4(del) (Uncommitted) txn4 := db.NewTransaction(true) require.NoError(t, txn4.SetEntry(NewEntry(kb, []byte("b4")))) require.NoError(t, txn4.Delete(kc)) require.Equal(t, uint64(3), db.orc.readTs()) // b4 (del) txn = db.NewTransaction(true) txn.Delete(kb) require.NoError(t, txn.Commit()) require.Equal(t, uint64(4), db.orc.readTs()) checkIterator := func(itr *Iterator, expected []string) { defer itr.Close() var i int for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() val, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, expected[i], string(val), "readts=%d", itr.readTs) i++ } require.Equal(t, len(expected), i) } txn = db.NewTransaction(true) defer txn.Discard() itr := txn.NewIterator(DefaultIteratorOptions) itr5 := txn4.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"a3", "c2"}) checkIterator(itr5, []string{"a3", "b4"}) rev := DefaultIteratorOptions rev.Reverse = true itr = txn.NewIterator(rev) itr5 = txn4.NewIterator(rev) checkIterator(itr, []string{"c2", "a3"}) checkIterator(itr5, []string{"b4", "a3"}) txn.readTs = 3 itr = txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"a3", "b3", "c2"}) itr = 
txn.NewIterator(rev) checkIterator(itr, []string{"c2", "b3", "a3"}) txn.readTs = 2 itr = txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"a2", "c2"}) itr = txn.NewIterator(rev) checkIterator(itr, []string{"c2", "a2"}) txn.readTs = 1 itr = txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"c1"}) itr = txn.NewIterator(rev) checkIterator(itr, []string{"c1"}) }) } // a2, a3, b4 (del), b3, c2, c1 // Read at ts=4 -> a3, c2 // Read at ts=3 -> a3, b3, c2 // Read at ts=2 -> a2, c2 // Read at ts=1 -> c1 func TestTxnIterationEdgeCase2(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { ka := []byte("a") kb := []byte("aa") kc := []byte("aaa") // c1 txn := db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(kc, []byte("c1")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(1), db.orc.readTs()) // a2, c2 txn = db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(ka, []byte("a2")))) require.NoError(t, txn.SetEntry(NewEntry(kc, []byte("c2")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(2), db.orc.readTs()) // b3 txn = db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(ka, []byte("a3")))) require.NoError(t, txn.SetEntry(NewEntry(kb, []byte("b3")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(3), db.orc.readTs()) // b4 (del) txn = db.NewTransaction(true) txn.Delete(kb) require.NoError(t, txn.Commit()) require.Equal(t, uint64(4), db.orc.readTs()) checkIterator := func(itr *Iterator, expected []string) { defer itr.Close() var i int for itr.Rewind(); itr.Valid(); itr.Next() { item := itr.Item() val, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, expected[i], string(val), "readts=%d", itr.readTs) i++ } require.Equal(t, len(expected), i) } txn = db.NewTransaction(true) defer txn.Discard() rev := DefaultIteratorOptions rev.Reverse = true itr := txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"a3", "c2"}) itr = txn.NewIterator(rev) checkIterator(itr, []string{"c2", "a3"}) txn.readTs = 5 itr = txn.NewIterator(DefaultIteratorOptions) itr.Seek(ka) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), ka) itr.Seek(kc) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Close() itr = txn.NewIterator(rev) itr.Seek(ka) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), ka) itr.Seek(kc) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Close() txn.readTs = 3 itr = txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"a3", "b3", "c2"}) itr = txn.NewIterator(rev) checkIterator(itr, []string{"c2", "b3", "a3"}) txn.readTs = 2 itr = txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"a2", "c2"}) itr = txn.NewIterator(rev) checkIterator(itr, []string{"c2", "a2"}) txn.readTs = 1 itr = txn.NewIterator(DefaultIteratorOptions) checkIterator(itr, []string{"c1"}) itr = txn.NewIterator(rev) checkIterator(itr, []string{"c1"}) }) } func TestTxnIterationEdgeCase3(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { kb := []byte("abc") kc := []byte("acd") kd := []byte("ade") // c1 txn := db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(kc, []byte("c1")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(1), db.orc.readTs()) // b2 txn = db.NewTransaction(true) require.NoError(t, txn.SetEntry(NewEntry(kb, []byte("b2")))) require.NoError(t, txn.Commit()) require.Equal(t, uint64(2), db.orc.readTs()) txn2 := db.NewTransaction(true) require.NoError(t, 
txn2.SetEntry(NewEntry(kd, []byte("d2")))) require.NoError(t, txn2.Delete(kc)) txn = db.NewTransaction(true) defer txn.Discard() rev := DefaultIteratorOptions rev.Reverse = true itr := txn.NewIterator(DefaultIteratorOptions) itr.Seek([]byte("ab")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ac")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Seek(nil) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ac")) itr.Rewind() itr.Seek(nil) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ac")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Close() // Keys: "abc", "ade" // Read pending writes. itr = txn2.NewIterator(DefaultIteratorOptions) itr.Seek([]byte("ab")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ac")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kd) itr.Seek(nil) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ac")) itr.Rewind() itr.Seek(nil) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ad")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kd) itr.Close() itr = txn.NewIterator(rev) itr.Seek([]byte("ac")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ad")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Seek(nil) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Seek([]byte("ac")) itr.Rewind() require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Seek([]byte("ad")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kc) itr.Close() // Keys: "abc", "ade" itr = txn2.NewIterator(rev) itr.Seek([]byte("ad")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Seek([]byte("ae")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kd) itr.Seek(nil) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kd) itr.Seek([]byte("ab")) itr.Rewind() require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kd) itr.Seek([]byte("ac")) require.True(t, itr.Valid()) require.Equal(t, itr.item.Key(), kb) itr.Close() }) } func TestIteratorAllVersionsWithDeleted(t *testing.T) { test := func(t *testing.T, db *DB) { // Write two keys err := db.Update(func(txn *Txn) error { require.NoError(t, txn.SetEntry(NewEntry([]byte("answer1"), []byte("42")))) return txn.SetEntry(NewEntry([]byte("answer2"), []byte("43"))) }) require.NoError(t, err) // Delete the specific key version from underlying db directly err = db.View(func(txn *Txn) error { item, err := txn.Get([]byte("answer1")) require.NoError(t, err) err = txn.db.batchSet([]*Entry{ { Key: y.KeyWithTs(item.key, item.version), meta: bitDelete, }, }) require.NoError(t, err) return err }) require.NoError(t, err) opts := DefaultIteratorOptions opts.AllVersions = true opts.PrefetchValues = false // Verify that deleted shows up when AllVersions is set. 
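// With AllVersions set, the iterator should surface the tombstone as well: "answer1" is
// expected to carry the bitDelete meta bit while "answer2" remains a live entry.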
err = db.View(func(txn *Txn) error { it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { count++ item := it.Item() if count == 1 { require.Equal(t, []byte("answer1"), item.Key()) require.True(t, item.meta&bitDelete > 0) } else { require.Equal(t, []byte("answer2"), item.Key()) } } require.Equal(t, 2, count) return nil }) require.NoError(t, err) } t.Run("disk mode", func(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { test(t, db) }) }) t.Run("InMemory mode", func(t *testing.T) { opt := getTestOptions("") opt.InMemory = true db, err := Open(opt) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) } func TestIteratorAllVersionsWithDeleted2(t *testing.T) { runBadgerTest(t, nil, func(t *testing.T, db *DB) { // Set and delete alternatively for i := 0; i < 4; i++ { err := db.Update(func(txn *Txn) error { if i%2 == 0 { require.NoError(t, txn.SetEntry(NewEntry([]byte("key"), []byte("value")))) return nil } return txn.Delete([]byte("key")) }) require.NoError(t, err) } opts := DefaultIteratorOptions opts.AllVersions = true opts.PrefetchValues = false // Verify that deleted shows up when AllVersions is set. err := db.View(func(txn *Txn) error { it := txn.NewIterator(opts) defer it.Close() var count int for it.Rewind(); it.Valid(); it.Next() { item := it.Item() require.Equal(t, []byte("key"), item.Key()) if count%2 != 0 { val, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val, []byte("value")) } else { require.True(t, item.meta&bitDelete > 0) } count++ } require.Equal(t, 4, count) return nil }) require.NoError(t, err) }) } func TestManagedDB(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.managedTxns = true test := func(t *testing.T, db *DB) { key := func(i int) []byte { return []byte(fmt.Sprintf("key-%02d", i)) } val := func(i int) []byte { return []byte(fmt.Sprintf("val-%d", i)) } require.Panics(t, func() { db.Update(func(tx *Txn) error { return nil }) }) err = db.View(func(tx *Txn) error { return nil }) require.NoError(t, err) // Write data at t=3. txn := db.NewTransactionAt(3, true) for i := 0; i <= 3; i++ { require.NoError(t, txn.SetEntry(NewEntry(key(i), val(i)))) } require.Error(t, txn.Commit()) require.NoError(t, txn.CommitAt(3, nil)) // Read data at t=2. txn = db.NewTransactionAt(2, false) for i := 0; i <= 3; i++ { _, err := txn.Get(key(i)) require.Equal(t, ErrKeyNotFound, err) } txn.Discard() // Read data at t=3. txn = db.NewTransactionAt(3, false) for i := 0; i <= 3; i++ { item, err := txn.Get(key(i)) require.NoError(t, err) require.Equal(t, uint64(3), item.Version()) v, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val(i), v) } txn.Discard() // Write data at t=7. txn = db.NewTransactionAt(6, true) for i := 0; i <= 7; i++ { _, err := txn.Get(key(i)) if err == nil { continue // Don't overwrite existing keys. } require.NoError(t, txn.SetEntry(NewEntry(key(i), val(i)))) } require.NoError(t, txn.CommitAt(7, nil)) // Read data at t=9. 
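// Keys 0-3 were committed at ts=3 and keys 4-7 at ts=7, so a read at ts=9 must see
// versions 3 and 7 respectively, while keys 8 and 9 were never written and must return
// ErrKeyNotFound.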
txn = db.NewTransactionAt(9, false) for i := 0; i <= 9; i++ { item, err := txn.Get(key(i)) if i <= 7 { require.NoError(t, err) } else { require.Equal(t, ErrKeyNotFound, err) } if i <= 3 { require.Equal(t, uint64(3), item.Version()) } else if i <= 7 { require.Equal(t, uint64(7), item.Version()) } if i <= 7 { v, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, val(i), v) } } txn.Discard() } t.Run("disk mode", func(t *testing.T) { db, err := Open(opt) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) t.Run("InMemory mode", func(t *testing.T) { opt.InMemory = true opt.Dir = "" opt.ValueDir = "" db, err := Open(opt) require.NoError(t, err) test(t, db) require.NoError(t, db.Close()) }) } func TestArmV7Issue311Fix(t *testing.T) { dir, err := ioutil.TempDir("", "") require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(dir). WithTableLoadingMode(options.MemoryMap). WithValueLogFileSize(16 << 20). WithLevelOneSize(8 << 20). WithMaxTableSize(2 << 20). WithSyncWrites(false)) require.NoError(t, err) err = db.View(func(txn *Txn) error { return nil }) require.NoError(t, err) err = db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte{0x11}, []byte{0x22})) }) require.NoError(t, err) err = db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte{0x11}, []byte{0x22})) }) require.NoError(t, err) require.NoError(t, db.Close()) } // This test tries to perform a GetAndSet operation using multiple concurrent // transaction and only one of the transactions should be successful. // Regression test for https://github.com/dgraph-io/badger/issues/1289 func TestConflict(t *testing.T) { key := []byte("foo") setCount := uint32(0) testAndSet := func(wg *sync.WaitGroup, db *DB) { defer wg.Done() txn := db.NewTransaction(true) defer txn.Discard() _, err := txn.Get(key) if err == ErrKeyNotFound { // Unset the error. err = nil require.NoError(t, txn.Set(key, []byte("AA"))) txn.CommitWith(func(err error) { if err == nil { require.LessOrEqual(t, uint32(1), atomic.AddUint32(&setCount, 1)) } else { require.Error(t, err, ErrConflict) } }) } require.NoError(t, err) } testAndSetItr := func(wg *sync.WaitGroup, db *DB) { defer wg.Done() txn := db.NewTransaction(true) defer txn.Discard() iopt := DefaultIteratorOptions it := txn.NewIterator(iopt) found := false for it.Seek(key); it.Valid(); it.Next() { found = true } it.Close() if !found { require.NoError(t, txn.Set(key, []byte("AA"))) txn.CommitWith(func(err error) { if err == nil { require.LessOrEqual(t, atomic.AddUint32(&setCount, 1), uint32(1)) } else { require.Error(t, err, ErrConflict) } }) } } runTest := func(t *testing.T, fn func(wg *sync.WaitGroup, db *DB)) { loop := 10 numGo := 16 // This many concurrent transactions. for i := 0; i < loop; i++ { var wg sync.WaitGroup wg.Add(numGo) setCount = 0 runBadgerTest(t, nil, func(t *testing.T, db *DB) { for j := 0; j < numGo; j++ { go fn(&wg, db) } wg.Wait() }) require.Equal(t, uint32(1), atomic.LoadUint32(&setCount)) } } t.Run("TxnGet", func(t *testing.T) { runTest(t, testAndSet) }) t.Run("ItrSeek", func(t *testing.T) { runTest(t, testAndSetItr) }) } badger-2.2007.2/util.go000066400000000000000000000056401372173116500145070ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "encoding/hex" "io/ioutil" "math/rand" "sync/atomic" "time" "github.com/dgraph-io/badger/v2/table" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" ) func (s *levelsController) validate() error { for _, l := range s.levels { if err := l.validate(); err != nil { return errors.Wrap(err, "Levels Controller") } } return nil } // Check does some sanity check on one level of data or in-memory index. func (s *levelHandler) validate() error { if s.level == 0 { return nil } s.RLock() defer s.RUnlock() numTables := len(s.tables) for j := 1; j < numTables; j++ { if j >= len(s.tables) { return errors.Errorf("Level %d, j=%d numTables=%d", s.level, j, numTables) } if y.CompareKeys(s.tables[j-1].Biggest(), s.tables[j].Smallest()) >= 0 { return errors.Errorf( "Inter: Biggest(j-1) \n%s\n vs Smallest(j): \n%s\n: level=%d j=%d numTables=%d", hex.Dump(s.tables[j-1].Biggest()), hex.Dump(s.tables[j].Smallest()), s.level, j, numTables) } if y.CompareKeys(s.tables[j].Smallest(), s.tables[j].Biggest()) > 0 { return errors.Errorf( "Intra: \n%s\n vs \n%s\n: level=%d j=%d numTables=%d", hex.Dump(s.tables[j].Smallest()), hex.Dump(s.tables[j].Biggest()), s.level, j, numTables) } } return nil } // func (s *KV) debugPrintMore() { s.lc.debugPrintMore() } // // debugPrintMore shows key ranges of each level. // func (s *levelsController) debugPrintMore() { // s.Lock() // defer s.Unlock() // for i := 0; i < s.kv.opt.MaxLevels; i++ { // s.levels[i].debugPrintMore() // } // } // func (s *levelHandler) debugPrintMore() { // s.RLock() // defer s.RUnlock() // s.elog.Printf("Level %d:", s.level) // for _, t := range s.tables { // y.Printf(" [%s, %s]", t.Smallest(), t.Biggest()) // } // y.Printf("\n") // } // reserveFileID reserves a unique file id. func (s *levelsController) reserveFileID() uint64 { id := atomic.AddUint64(&s.nextFileID, 1) return id - 1 } func getIDMap(dir string) map[uint64]struct{} { fileInfos, err := ioutil.ReadDir(dir) y.Check(err) idMap := make(map[uint64]struct{}) for _, info := range fileInfos { if info.IsDir() { continue } fileID, ok := table.ParseFileID(info.Name()) if !ok { continue } idMap[fileID] = struct{}{} } return idMap } func init() { rand.Seed(time.Now().UnixNano()) } badger-2.2007.2/value.go000066400000000000000000001632561372173116500146560ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package badger import ( "bufio" "bytes" "crypto/aes" cryptorand "crypto/rand" "encoding/binary" "encoding/json" "fmt" "hash" "hash/crc32" "io" "io/ioutil" "math" "math/rand" "os" "sort" "strconv" "strings" "sync" "sync/atomic" "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/pb" "github.com/dgraph-io/badger/v2/y" "github.com/pkg/errors" "golang.org/x/net/trace" ) // maxVlogFileSize is the maximum size of the vlog file which can be created. Vlog Offset is of // uint32, so limiting at max uint32. var maxVlogFileSize = math.MaxUint32 // Values have their first byte being byteData or byteDelete. This helps us distinguish between // a key that has never been seen and a key that has been explicitly deleted. const ( bitDelete byte = 1 << 0 // Set if the key has been deleted. bitValuePointer byte = 1 << 1 // Set if the value is NOT stored directly next to key. bitDiscardEarlierVersions byte = 1 << 2 // Set if earlier versions can be discarded. // Set if item shouldn't be discarded via compactions (used by merge operator) bitMergeEntry byte = 1 << 3 // The MSB 2 bits are for transactions. bitTxn byte = 1 << 6 // Set if the entry is part of a txn. bitFinTxn byte = 1 << 7 // Set if the entry is to indicate end of txn in value log. mi int64 = 1 << 20 // The number of updates after which discard map should be flushed into badger. discardStatsFlushThreshold = 100 // size of vlog header. // +----------------+------------------+ // | keyID(8 bytes) | baseIV(12 bytes)| // +----------------+------------------+ vlogHeaderSize = 20 ) type logFile struct { path string // This is a lock on the log file. It guards the fd’s value, the file’s // existence and the file’s memory map. // // Use shared ownership when reading/writing the file or memory map, use // exclusive ownership to open/close the descriptor, unmap or remove the file. lock sync.RWMutex fd *os.File fid uint32 fmap []byte size uint32 loadingMode options.FileLoadingMode dataKey *pb.DataKey baseIV []byte registry *KeyRegistry } // encodeEntry will encode entry to the buf // layout of entry // +--------+-----+-------+-------+ // | header | key | value | crc32 | // +--------+-----+-------+-------+ func (lf *logFile) encodeEntry(e *Entry, buf *bytes.Buffer, offset uint32) (int, error) { h := header{ klen: uint32(len(e.Key)), vlen: uint32(len(e.Value)), expiresAt: e.ExpiresAt, meta: e.meta, userMeta: e.UserMeta, } // encode header. var headerEnc [maxHeaderSize]byte sz := h.Encode(headerEnc[:]) y.Check2(buf.Write(headerEnc[:sz])) // write hash. hash := crc32.New(y.CastagnoliCrcTable) y.Check2(hash.Write(headerEnc[:sz])) // we'll encrypt only key and value. if lf.encryptionEnabled() { // TODO: no need to allocate the bytes. we can calculate the encrypted buf one by one // since we're using ctr mode of AES encryption. Ordering won't changed. Need some // refactoring in XORBlock which will work like stream cipher. eBuf := make([]byte, 0, len(e.Key)+len(e.Value)) eBuf = append(eBuf, e.Key...) eBuf = append(eBuf, e.Value...) var err error eBuf, err = y.XORBlock(eBuf, lf.dataKey.Data, lf.generateIV(offset)) if err != nil { return 0, y.Wrapf(err, "Error while encoding entry for vlog.") } // write encrypted buf. y.Check2(buf.Write(eBuf)) // write the hash. y.Check2(hash.Write(eBuf)) } else { // Encryption is disabled so writing directly to the buffer. // write key. y.Check2(buf.Write(e.Key)) // write key hash. y.Check2(hash.Write(e.Key)) // write value. y.Check2(buf.Write(e.Value)) // write value hash. 
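// In both the encrypted and plain-text branches, the same bytes are fed to hash as are
// written to buf, so the crc32 appended below covers header+key+value either way.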
y.Check2(hash.Write(e.Value)) } // write crc32 hash. var crcBuf [crc32.Size]byte binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32()) y.Check2(buf.Write(crcBuf[:])) // return encoded length. return len(headerEnc[:sz]) + len(e.Key) + len(e.Value) + len(crcBuf), nil } func (lf *logFile) decodeEntry(buf []byte, offset uint32) (*Entry, error) { var h header hlen := h.Decode(buf) kv := buf[hlen:] if lf.encryptionEnabled() { var err error // No need to worry about mmap, because XORBlock allocates a byte array to do the // xor. So, the given slice is not being mutated. if kv, err = lf.decryptKV(kv, offset); err != nil { return nil, err } } e := &Entry{ meta: h.meta, UserMeta: h.userMeta, ExpiresAt: h.expiresAt, offset: offset, Key: kv[:h.klen], Value: kv[h.klen : h.klen+h.vlen], } return e, nil } func (lf *logFile) decryptKV(buf []byte, offset uint32) ([]byte, error) { return y.XORBlock(buf, lf.dataKey.Data, lf.generateIV(offset)) } // keyID returns the datakey's ID. func (lf *logFile) keyID() uint64 { if lf.dataKey == nil { // If there is no datakey, then we'll return 0. Which means no encryption. return 0 } return lf.dataKey.KeyId } func (lf *logFile) mmap(size int64) (err error) { if lf.loadingMode != options.MemoryMap { // Nothing to do return nil } lf.fmap, err = y.Mmap(lf.fd, false, size) if err == nil { err = y.Madvise(lf.fmap, false) // Disable readahead } return err } func (lf *logFile) encryptionEnabled() bool { return lf.dataKey != nil } func (lf *logFile) munmap() (err error) { if lf.loadingMode != options.MemoryMap || len(lf.fmap) == 0 { // Nothing to do return nil } if err := y.Munmap(lf.fmap); err != nil { return errors.Wrapf(err, "Unable to munmap value log: %q", lf.path) } // This is important. We should set the map to nil because the munmap // system call doesn't change the length or capacity of the fmap slice. lf.fmap = nil return nil } // Acquire lock on mmap/file if you are calling this function. func (lf *logFile) read(p valuePointer, s *y.Slice) (buf []byte, err error) { var nbr int64 offset := p.Offset if lf.loadingMode == options.FileIO { buf = s.Resize(int(p.Len)) var n int n, err = lf.fd.ReadAt(buf, int64(offset)) nbr = int64(n) } else { // Do not convert size to uint32, because the lf.fmap can be of size // 4GB, which overflows the uint32 during conversion to make the size 0, // causing the read to fail with ErrEOF. See issue #585. size := int64(len(lf.fmap)) valsz := p.Len lfsz := atomic.LoadUint32(&lf.size) if int64(offset) >= size || int64(offset+valsz) > size || // Ensure that the read is within the file's actual size. It might be possible that // the offset+valsz length is beyond the file's actual size. This could happen when // dropAll and iterations are running simultaneously. int64(offset+valsz) > int64(lfsz) { err = y.ErrEOF } else { buf = lf.fmap[offset : offset+valsz] nbr = int64(valsz) } } y.NumReads.Add(1) y.NumBytesRead.Add(nbr) return buf, err } // generateIV will generate an IV by appending the given offset to the base IV. func (lf *logFile) generateIV(offset uint32) []byte { iv := make([]byte, aes.BlockSize) // baseIV is of 12 bytes. y.AssertTrue(12 == copy(iv[:12], lf.baseIV)) // remaining 4 bytes are obtained from the offset. binary.BigEndian.PutUint32(iv[12:], offset) return iv } func (lf *logFile) doneWriting(offset uint32) error { // Sync before acquiring lock. (We call this from write() and thus know we have shared access // to the fd.) 
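// Syncing here, before taking the exclusive lock below, keeps the critical section
// short: only the munmap/truncate/remap sequence needs to block readers.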
if err := lf.fd.Sync(); err != nil { return errors.Wrapf(err, "Unable to sync value log: %q", lf.path) } // Before we were acquiring a lock here on lf.lock, because we were invalidating the file // descriptor due to reopening it as read-only. Now, we don't invalidate the fd, but unmap it, // truncate it and remap it. That creates a window where we have segfaults because the mmap is // no longer valid, while someone might be reading it. Therefore, we need a lock here again. lf.lock.Lock() defer lf.lock.Unlock() // Unmap file before we truncate it. Windows cannot truncate a file that is mmapped. if err := lf.munmap(); err != nil { return errors.Wrapf(err, "failed to munmap vlog file %s", lf.fd.Name()) } // TODO: Confirm if we need to run a file sync after truncation. // Truncation must run after unmapping, otherwise Windows would crap itself. if err := lf.fd.Truncate(int64(offset)); err != nil { return errors.Wrapf(err, "Unable to truncate file: %q", lf.path) } // Reinitialize the log file. This will mmap the entire file. if err := lf.init(); err != nil { return errors.Wrapf(err, "failed to initialize file %s", lf.fd.Name()) } // Previously we used to close the file after it was written and reopen it in read-only mode. // We no longer open files in read-only mode. We keep all vlog files open in read-write mode. return nil } // You must hold lf.lock to sync() func (lf *logFile) sync() error { return lf.fd.Sync() } var errStop = errors.New("Stop iteration") var errTruncate = errors.New("Do truncate") var errDeleteVlogFile = errors.New("Delete vlog file") type logEntry func(e Entry, vp valuePointer) error type safeRead struct { k []byte v []byte recordOffset uint32 lf *logFile } // hashReader implements the io.Reader and io.ByteReader interfaces. It also keeps track of the // number of bytes read. The hashReader writes to h (hash) what it reads from r. type hashReader struct { r io.Reader h hash.Hash32 bytesRead int // Number of bytes read. } func newHashReader(r io.Reader) *hashReader { hash := crc32.New(y.CastagnoliCrcTable) return &hashReader{ r: r, h: hash, } } // Read reads len(p) bytes from the reader. Returns the number of bytes read, error on failure. func (t *hashReader) Read(p []byte) (int, error) { n, err := t.r.Read(p) if err != nil { return n, err } t.bytesRead += n return t.h.Write(p[:n]) } // ReadByte reads exactly one byte from the reader. Returns error on failure. func (t *hashReader) ReadByte() (byte, error) { b := make([]byte, 1) _, err := t.Read(b) return b[0], err } // Sum32 returns the sum32 of the underlying hash. func (t *hashReader) Sum32() uint32 { return t.h.Sum32() } // Entry reads an entry from the provided reader. It also validates the checksum for every entry // read. Returns error on failure. func (r *safeRead) Entry(reader io.Reader) (*Entry, error) { tee := newHashReader(reader) var h header hlen, err := h.DecodeFrom(tee) if err != nil { return nil, err } if h.klen > uint32(1<<16) { // Key length must be below uint16. 
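// A klen above 1<<16 cannot come from a legitimately written entry, so treat it as
// corruption and let the replay logic truncate the log from this point.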
return nil, errTruncate } kl := int(h.klen) if cap(r.k) < kl { r.k = make([]byte, 2*kl) } vl := int(h.vlen) if cap(r.v) < vl { r.v = make([]byte, 2*vl) } e := &Entry{} e.offset = r.recordOffset e.hlen = hlen buf := make([]byte, h.klen+h.vlen) if _, err := io.ReadFull(tee, buf[:]); err != nil { if err == io.EOF { err = errTruncate } return nil, err } if r.lf.encryptionEnabled() { if buf, err = r.lf.decryptKV(buf[:], r.recordOffset); err != nil { return nil, err } } e.Key = buf[:h.klen] e.Value = buf[h.klen:] var crcBuf [crc32.Size]byte if _, err := io.ReadFull(reader, crcBuf[:]); err != nil { if err == io.EOF { err = errTruncate } return nil, err } crc := y.BytesToU32(crcBuf[:]) if crc != tee.Sum32() { return nil, errTruncate } e.meta = h.meta e.UserMeta = h.userMeta e.ExpiresAt = h.expiresAt return e, nil } // iterate iterates over the log file. It doesn't allocate new memory for every kv pair. // Therefore, the kv pair is only valid for the duration of the fn call. func (vlog *valueLog) iterate(lf *logFile, offset uint32, fn logEntry) (uint32, error) { fi, err := lf.fd.Stat() if err != nil { return 0, err } if offset == 0 { // If offset is set to zero, let's advance past the encryption key header. offset = vlogHeaderSize } if int64(offset) == fi.Size() { // We're at the end of the file already. No need to do anything. return offset, nil } if vlog.opt.ReadOnly { // We're not at the end of the file. We'd need to replay the entries, or // possibly truncate the file. return 0, ErrReplayNeeded } // We're not at the end of the file. Let's Seek to the offset and start reading. if _, err := lf.fd.Seek(int64(offset), io.SeekStart); err != nil { return 0, errFile(err, lf.path, "Unable to seek") } reader := bufio.NewReader(lf.fd) read := &safeRead{ k: make([]byte, 10), v: make([]byte, 10), recordOffset: offset, lf: lf, } var lastCommit uint64 var validEndOffset uint32 = offset loop: for { e, err := read.Entry(reader) switch { case err == io.EOF: break loop case err == io.ErrUnexpectedEOF || err == errTruncate: break loop case err != nil: return 0, err case e == nil: continue } var vp valuePointer vp.Len = uint32(int(e.hlen) + len(e.Key) + len(e.Value) + crc32.Size) read.recordOffset += vp.Len vp.Offset = e.offset vp.Fid = lf.fid switch { case e.meta&bitTxn > 0: txnTs := y.ParseTs(e.Key) if lastCommit == 0 { lastCommit = txnTs } if lastCommit != txnTs { break loop } case e.meta&bitFinTxn > 0: txnTs, err := strconv.ParseUint(string(e.Value), 10, 64) if err != nil || lastCommit != txnTs { break loop } // Got the end of txn. Now we can store them. lastCommit = 0 validEndOffset = read.recordOffset default: if lastCommit != 0 { // This is most likely an entry which was moved as part of GC. // We shouldn't get this entry in the middle of a transaction. break loop } validEndOffset = read.recordOffset } if err := fn(*e, vp); err != nil { if err == errStop { break } return 0, errFile(err, lf.path, "Iteration function") } } return validEndOffset, nil } func (vlog *valueLog) rewrite(f *logFile, tr trace.Trace) error { vlog.filesLock.RLock() maxFid := vlog.maxFid vlog.filesLock.RUnlock() y.AssertTruef(uint32(f.fid) < maxFid, "fid to move: %d. 
Current max fid: %d", f.fid, maxFid) tr.LazyPrintf("Rewriting fid: %d", f.fid) wb := make([]*Entry, 0, 1000) var size int64 y.AssertTrue(vlog.db != nil) var count, moved int fe := func(e Entry) error { count++ if count%100000 == 0 { tr.LazyPrintf("Processing entry %d", count) } vs, err := vlog.db.get(e.Key) if err != nil { return err } if discardEntry(e, vs, vlog.db) { return nil } // Value is still present in value log. if len(vs.Value) == 0 { return errors.Errorf("Empty value: %+v", vs) } var vp valuePointer vp.Decode(vs.Value) // If the entry found from the LSM Tree points to a newer vlog file, don't do anything. if vp.Fid > f.fid { return nil } // If the entry found from the LSM Tree points to an offset greater than the one // read from vlog, don't do anything. if vp.Offset > e.offset { return nil } // If the entry read from LSM Tree and vlog file point to the same vlog file and offset, // insert them back into the DB. // NOTE: It might be possible that the entry read from the LSM Tree points to // an older vlog file. See the comments in the else part. if vp.Fid == f.fid && vp.Offset == e.offset { moved++ // This new entry only contains the key, and a pointer to the value. ne := new(Entry) ne.meta = 0 // Remove all bits. Different keyspace doesn't need these bits. ne.UserMeta = e.UserMeta ne.ExpiresAt = e.ExpiresAt // Create a new key in a separate keyspace, prefixed by moveKey. We are not // allowed to rewrite an older version of key in the LSM tree, because then this older // version would be at the top of the LSM tree. To work correctly, reads expect the // latest versions to be at the top, and the older versions at the bottom. if bytes.HasPrefix(e.Key, badgerMove) { ne.Key = append([]byte{}, e.Key...) } else { ne.Key = make([]byte, len(badgerMove)+len(e.Key)) n := copy(ne.Key, badgerMove) copy(ne.Key[n:], e.Key) } ne.Value = append([]byte{}, e.Value...) es := int64(ne.estimateSize(vlog.opt.ValueThreshold)) // Consider size of value as well while considering the total size // of the batch. There have been reports of high memory usage in // rewrite because we don't consider the value size. See #1292. es += int64(len(e.Value)) // Ensure length and size of wb is within transaction limits. if int64(len(wb)+1) >= vlog.opt.maxBatchCount || size+es >= vlog.opt.maxBatchSize { tr.LazyPrintf("request has %d entries, size %d", len(wb), size) if err := vlog.db.batchSet(wb); err != nil { return err } size = 0 wb = wb[:0] } wb = append(wb, ne) size += es } else { // It might be possible that the entry read from LSM Tree points to an older vlog file. // This can happen in the following situation. Assume DB is opened with // numberOfVersionsToKeep=1 // // Now, if we have ONLY one key in the system "FOO" which has been updated 3 times and // the same key has been garbage collected 3 times, we'll have 3 versions of the movekey // for the same key "FOO". // NOTE: moveKeyi is the moveKey with version i // Assume we have 3 move keys in L0. // - moveKey1 (points to vlog file 10), // - moveKey2 (points to vlog file 14) and // - moveKey3 (points to vlog file 15). // Also, assume there is another move key "moveKey1" (points to vlog file 6) (this is // also a move Key for key "FOO" ) on upper levels (let's say 3). The move key // "moveKey1" on level 0 was inserted because vlog file 6 was GCed. // // Here's what the arrangement looks like // L0 => (moveKey1 => vlog10), (moveKey2 => vlog14), (moveKey3 => vlog15) // L1 => .... // L2 => .... 
// L3 => (moveKey1 => vlog6) // // When L0 compaction runs, it keeps only moveKey3 because the number of versions // to keep is set to 1. (we've dropped moveKey1's latest version) // // The new arrangement of keys is // L0 => .... // L1 => (moveKey3 => vlog15) // L2 => .... // L3 => (moveKey1 => vlog6) // // Now if we try to GC vlog file 10, the entry read from vlog file will point to vlog10 // but the entry read from LSM Tree will point to vlog6. The move key read from LSM tree // will point to vlog6 because we've asked for version 1 of the move key. // // This might seem like an issue but it's not really an issue because the user has set // the number of versions to keep to 1 and the latest version of moveKey points to the // correct vlog file and offset. The stale move key on L3 will be eventually dropped by // compaction because there is a newer version in the upper levels. } return nil } _, err := vlog.iterate(f, 0, func(e Entry, vp valuePointer) error { return fe(e) }) if err != nil { return err } tr.LazyPrintf("request has %d entries, size %d", len(wb), size) batchSize := 1024 var loops int for i := 0; i < len(wb); { loops++ if batchSize == 0 { vlog.db.opt.Warningf("We shouldn't reach batch size of zero.") return ErrNoRewrite } end := i + batchSize if end > len(wb) { end = len(wb) } if err := vlog.db.batchSet(wb[i:end]); err != nil { if err == ErrTxnTooBig { // Decrease the batch size to half. batchSize = batchSize / 2 tr.LazyPrintf("Dropped batch size to %d", batchSize) continue } return err } i += batchSize } tr.LazyPrintf("Processed %d entries in %d loops", len(wb), loops) tr.LazyPrintf("Total entries: %d. Moved: %d", count, moved) tr.LazyPrintf("Removing fid: %d", f.fid) var deleteFileNow bool // Entries written to LSM. Remove the older file now. { vlog.filesLock.Lock() // Just a sanity-check. if _, ok := vlog.filesMap[f.fid]; !ok { vlog.filesLock.Unlock() return errors.Errorf("Unable to find fid: %d", f.fid) } if vlog.iteratorCount() == 0 { delete(vlog.filesMap, f.fid) deleteFileNow = true } else { vlog.filesToBeDeleted = append(vlog.filesToBeDeleted, f.fid) } vlog.filesLock.Unlock() } if deleteFileNow { if err := vlog.deleteLogFile(f); err != nil { return err } } return nil } func (vlog *valueLog) deleteMoveKeysFor(fid uint32, tr trace.Trace) error { db := vlog.db var result []*Entry var count, pointers uint64 tr.LazyPrintf("Iterating over move keys to find invalids for fid: %d", fid) err := db.View(func(txn *Txn) error { opt := DefaultIteratorOptions opt.InternalAccess = true opt.PrefetchValues = false itr := txn.NewIterator(opt) defer itr.Close() for itr.Seek(badgerMove); itr.ValidForPrefix(badgerMove); itr.Next() { count++ item := itr.Item() if item.meta&bitValuePointer == 0 { continue } pointers++ var vp valuePointer vp.Decode(item.vptr) if vp.Fid == fid { e := &Entry{Key: y.KeyWithTs(item.Key(), item.Version()), meta: bitDelete} result = append(result, e) } } return nil }) if err != nil { tr.LazyPrintf("Got error while iterating move keys: %v", err) tr.SetError() return err } tr.LazyPrintf("Num total move keys: %d. 
Num pointers: %d", count, pointers) tr.LazyPrintf("Number of invalid move keys found: %d", len(result)) batchSize := 10240 for i := 0; i < len(result); { end := i + batchSize if end > len(result) { end = len(result) } if err := db.batchSet(result[i:end]); err != nil { if err == ErrTxnTooBig { batchSize /= 2 tr.LazyPrintf("Dropped batch size to %d", batchSize) continue } tr.LazyPrintf("Error while doing batchSet: %v", err) tr.SetError() return err } i += batchSize } tr.LazyPrintf("Move keys deletion done.") return nil } func (vlog *valueLog) incrIteratorCount() { atomic.AddInt32(&vlog.numActiveIterators, 1) } func (vlog *valueLog) iteratorCount() int { return int(atomic.LoadInt32(&vlog.numActiveIterators)) } func (vlog *valueLog) decrIteratorCount() error { num := atomic.AddInt32(&vlog.numActiveIterators, -1) if num != 0 { return nil } vlog.filesLock.Lock() lfs := make([]*logFile, 0, len(vlog.filesToBeDeleted)) for _, id := range vlog.filesToBeDeleted { lfs = append(lfs, vlog.filesMap[id]) delete(vlog.filesMap, id) } vlog.filesToBeDeleted = nil vlog.filesLock.Unlock() for _, lf := range lfs { if err := vlog.deleteLogFile(lf); err != nil { return err } } return nil } func (vlog *valueLog) deleteLogFile(lf *logFile) error { if lf == nil { return nil } lf.lock.Lock() defer lf.lock.Unlock() path := vlog.fpath(lf.fid) if err := lf.munmap(); err != nil { _ = lf.fd.Close() return err } lf.fmap = nil if err := lf.fd.Close(); err != nil { return err } return os.Remove(path) } func (vlog *valueLog) dropAll() (int, error) { // If db is opened in InMemory mode, we don't need to do anything since there are no vlog files. if vlog.db.opt.InMemory { return 0, nil } // We don't want to block dropAll on any pending transactions. So, don't worry about iterator // count. var count int deleteAll := func() error { vlog.filesLock.Lock() defer vlog.filesLock.Unlock() for _, lf := range vlog.filesMap { if err := vlog.deleteLogFile(lf); err != nil { return err } count++ } vlog.filesMap = make(map[uint32]*logFile) return nil } if err := deleteAll(); err != nil { return count, err } vlog.db.opt.Infof("Value logs deleted. Creating value log file: 0") if _, err := vlog.createVlogFile(0); err != nil { // Called while writes are stopped. return count, err } return count, nil } // lfDiscardStats keeps track of the amount of data that could be discarded for // a given logfile. type lfDiscardStats struct { sync.RWMutex m map[uint32]int64 flushChan chan map[uint32]int64 closer *y.Closer updatesSinceFlush int } type valueLog struct { dirPath string // guards our view of which files exist, which to be deleted, how many active iterators filesLock sync.RWMutex filesMap map[uint32]*logFile maxFid uint32 filesToBeDeleted []uint32 // A refcount of iterators -- when this hits zero, we can delete the filesToBeDeleted. numActiveIterators int32 db *DB writableLogOffset uint32 // read by read, written by write. Must access via atomics. 
numEntriesWritten uint32 opt Options garbageCh chan struct{} lfDiscardStats *lfDiscardStats } func vlogFilePath(dirPath string, fid uint32) string { return fmt.Sprintf("%s%s%06d.vlog", dirPath, string(os.PathSeparator), fid) } func (vlog *valueLog) fpath(fid uint32) string { return vlogFilePath(vlog.dirPath, fid) } func (vlog *valueLog) populateFilesMap() error { vlog.filesMap = make(map[uint32]*logFile) files, err := ioutil.ReadDir(vlog.dirPath) if err != nil { return errFile(err, vlog.dirPath, "Unable to open log dir.") } found := make(map[uint64]struct{}) for _, file := range files { if !strings.HasSuffix(file.Name(), ".vlog") { continue } fsz := len(file.Name()) fid, err := strconv.ParseUint(file.Name()[:fsz-5], 10, 32) if err != nil { return errFile(err, file.Name(), "Unable to parse log id.") } if _, ok := found[fid]; ok { return errFile(err, file.Name(), "Duplicate file found. Please delete one.") } found[fid] = struct{}{} lf := &logFile{ fid: uint32(fid), path: vlog.fpath(uint32(fid)), loadingMode: vlog.opt.ValueLogLoadingMode, registry: vlog.db.registry, } vlog.filesMap[uint32(fid)] = lf if vlog.maxFid < uint32(fid) { vlog.maxFid = uint32(fid) } } return nil } func (lf *logFile) open(path string, flags uint32) error { var err error if lf.fd, err = y.OpenExistingFile(path, flags); err != nil { return y.Wrapf(err, "Error while opening file in logfile %s", path) } fi, err := lf.fd.Stat() if err != nil { return errFile(err, lf.path, "Unable to run file.Stat") } sz := fi.Size() y.AssertTruef( sz <= math.MaxUint32, "file size: %d greater than %d", uint32(sz), uint32(math.MaxUint32), ) lf.size = uint32(sz) if sz < vlogHeaderSize { // Every vlog file should have at least vlogHeaderSize. If it is less than vlogHeaderSize // then it must have been corrupted. But no need to handle it here. The log replayer will // truncate and bootstrap the logfile, so we ignore it here. return nil } buf := make([]byte, vlogHeaderSize) if _, err = lf.fd.Read(buf); err != nil { return y.Wrapf(err, "Error while reading vlog file %d", lf.fid) } keyID := binary.BigEndian.Uint64(buf[:8]) var dk *pb.DataKey // retrieve datakey. if dk, err = lf.registry.dataKey(keyID); err != nil { return y.Wrapf(err, "While opening vlog file %d", lf.fid) } lf.dataKey = dk lf.baseIV = buf[8:] y.AssertTrue(len(lf.baseIV) == 12) return nil } // bootstrap will initialize the log file with key id and baseIV. // The below figure shows the layout of log file. // +----------------+------------------+------------------+ // | keyID(8 bytes) | baseIV(12 bytes)| entry... | // +----------------+------------------+------------------+ func (lf *logFile) bootstrap() error { var err error // Delete all the data, because bootstrap is called both while creating the vlog and while // replaying the log. While replaying, there may be stale data left, so we need to truncate // everything. if err = lf.fd.Truncate(0); err != nil { return y.Wrapf(err, "Error while bootstrapping.") } if _, err = lf.fd.Seek(0, io.SeekStart); err != nil { return y.Wrapf(err, "Error while SeekStart for the logfile %d in logFile.bootstrap", lf.fid) } // generate data key for the log file. var dk *pb.DataKey if dk, err = lf.registry.latestDataKey(); err != nil { return y.Wrapf(err, "Error while retrieving datakey in logFile.bootstrap") } lf.dataKey = dk // We'll always preserve vlogHeaderSize for key id and baseIV. buf := make([]byte, vlogHeaderSize) // write key id to the buf. // key id will be zero if the logfile is in plain text. 
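// keyID() returns 0 when lf.dataKey is nil, so a plain-text logfile is self-describing:
// a reader that finds key id 0 in the header treats the file as unencrypted.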
binary.BigEndian.PutUint64(buf[:8], lf.keyID()) // generate base IV. It'll be used with the offset of the vptr to encrypt the entry. if _, err := cryptorand.Read(buf[8:]); err != nil { return y.Wrapf(err, "Error while creating base IV, while creating logfile") } // Initialize base IV. lf.baseIV = buf[8:] y.AssertTrue(len(lf.baseIV) == 12) // write the key id and base IV to the file. _, err = lf.fd.Write(buf) return err } func (vlog *valueLog) createVlogFile(fid uint32) (*logFile, error) { path := vlog.fpath(fid) lf := &logFile{ fid: fid, path: path, loadingMode: vlog.opt.ValueLogLoadingMode, registry: vlog.db.registry, } // writableLogOffset is only written by the write func, but read by the Read func. // To avoid a race condition, all reads and updates to this variable must be // done via atomics. var err error if lf.fd, err = y.CreateSyncedFile(path, vlog.opt.SyncWrites); err != nil { return nil, errFile(err, lf.path, "Create value log file") } removeFile := func() { // Remove the file so that we don't get an error when createVlogFile is // called for the same fid, again. This could happen if there is a // transient error because of which we couldn't create a new file // and the second attempt to create the file succeeds. y.Check(os.Remove(lf.fd.Name())) } if err = lf.bootstrap(); err != nil { removeFile() return nil, err } if err = syncDir(vlog.dirPath); err != nil { removeFile() return nil, errFile(err, vlog.dirPath, "Sync value log dir") } if err = lf.mmap(2 * vlog.opt.ValueLogFileSize); err != nil { removeFile() return nil, errFile(err, lf.path, "Mmap value log file") } vlog.filesLock.Lock() vlog.filesMap[fid] = lf vlog.maxFid = fid // writableLogOffset is only written by the write func, but read by the Read func. // To avoid a race condition, all reads and updates to this variable must be // done via atomics. atomic.StoreUint32(&vlog.writableLogOffset, vlogHeaderSize) vlog.numEntriesWritten = 0 vlog.filesLock.Unlock() return lf, nil } func errFile(err error, path string, msg string) error { return fmt.Errorf("%s. Path=%s. Error=%v", msg, path, err) } func (vlog *valueLog) replayLog(lf *logFile, offset uint32, replayFn logEntry) error { fi, err := lf.fd.Stat() if err != nil { return errFile(err, lf.path, "Unable to run file.Stat") } // Alright, let's iterate now. endOffset, err := vlog.iterate(lf, offset, replayFn) if err != nil { return errFile(err, lf.path, "Unable to replay logfile") } if int64(endOffset) == fi.Size() { return nil } // End offset is different from file size. So, we should truncate the file // to that size. if !vlog.opt.Truncate { vlog.db.opt.Warningf("Truncate Needed. File %s size: %d Endoffset: %d", lf.fd.Name(), fi.Size(), endOffset) return ErrTruncateNeeded } // The entire file should be truncated (i.e. it should be deleted). // If fid == maxFid then it's okay to truncate the entire file since it will be // used for future additions. Also, it's okay if the last file has size zero. // We mmap 2*opt.ValueLogSize for the last file. See vlog.Open() function // if endOffset <= vlogHeaderSize && lf.fid != vlog.maxFid { if endOffset <= vlogHeaderSize { if lf.fid != vlog.maxFid { return errDeleteVlogFile } return lf.bootstrap() } vlog.db.opt.Infof("Truncating vlog file %s to offset: %d", lf.fd.Name(), endOffset) if err := lf.fd.Truncate(int64(endOffset)); err != nil { return errFile(err, lf.path, fmt.Sprintf( "Truncation needed at offset %d. Can be done manually as well.", endOffset)) } return nil } // init initializes the value log struct. 
This initialization needs to happen // before compactions start. func (vlog *valueLog) init(db *DB) { vlog.opt = db.opt vlog.db = db // We don't need to open any vlog files or collect stats for GC if DB is opened // in InMemory mode. InMemory mode doesn't create any files/directories on disk. if vlog.opt.InMemory { return } vlog.dirPath = vlog.opt.ValueDir vlog.garbageCh = make(chan struct{}, 1) // Only allow one GC at a time. vlog.lfDiscardStats = &lfDiscardStats{ m: make(map[uint32]int64), closer: y.NewCloser(1), flushChan: make(chan map[uint32]int64, 16), } } func (vlog *valueLog) open(db *DB, ptr valuePointer, replayFn logEntry) error { // We don't need to open any vlog files or collect stats for GC if DB is opened // in InMemory mode. InMemory mode doesn't create any files/directories on disk. if db.opt.InMemory { return nil } go vlog.flushDiscardStats() if err := vlog.populateFilesMap(); err != nil { return err } // If no files are found, then create a new file. if len(vlog.filesMap) == 0 { _, err := vlog.createVlogFile(0) return y.Wrapf(err, "Error while creating log file in valueLog.open") } fids := vlog.sortedFids() for _, fid := range fids { lf, ok := vlog.filesMap[fid] y.AssertTrue(ok) var flags uint32 switch { case vlog.opt.ReadOnly: // If we have read only, we don't need SyncWrites. flags |= y.ReadOnly // Set sync flag. case vlog.opt.SyncWrites: flags |= y.Sync } // We cannot mmap the files upfront here. Windows does not like mmapped files to be // truncated. We might need to truncate files during a replay. var err error if err = lf.open(vlog.fpath(fid), flags); err != nil { return errors.Wrapf(err, "Open existing file: %q", lf.path) } // This file is before the value head pointer. So, we don't need to // replay it, and can just open it in readonly mode. if fid < ptr.Fid { // Mmap the file here, we don't need to replay it. if err := lf.init(); err != nil { return err } continue } var offset uint32 if fid == ptr.Fid { offset = ptr.Offset + ptr.Len } vlog.db.opt.Infof("Replaying file id: %d at offset: %d\n", fid, offset) now := time.Now() // Replay and possible truncation done. Now we can open the file as per // user specified options. if err := vlog.replayLog(lf, offset, replayFn); err != nil { // Log file is corrupted. Delete it. if err == errDeleteVlogFile { delete(vlog.filesMap, fid) // Close the fd of the file before deleting the file otherwise windows complaints. if err := lf.fd.Close(); err != nil { return errors.Wrapf(err, "failed to close vlog file %s", lf.fd.Name()) } path := vlog.fpath(lf.fid) if err := os.Remove(path); err != nil { return y.Wrapf(err, "failed to delete empty value log file: %q", path) } continue } return err } vlog.db.opt.Infof("Replay took: %s\n", time.Since(now)) if fid < vlog.maxFid { // This file has been replayed. It can now be mmapped. // For maxFid, the mmap would be done by the specially written code below. if err := lf.init(); err != nil { return err } } } // Seek to the end to start writing. last, ok := vlog.filesMap[vlog.maxFid] y.AssertTrue(ok) // We'll create a new vlog if the last vlog is encrypted and db is opened in // plain text mode or vice versa. A single vlog file can't have both // encrypted entries and plain text entries. 
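// For example, a DB that previously ran with encryption and is now opened without an
// encryption key (or vice versa) must rotate to a fresh file here instead of mixing
// entry formats within a single vlog.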
if last.encryptionEnabled() != vlog.db.shouldEncrypt() { newid := vlog.maxFid + 1 _, err := vlog.createVlogFile(newid) if err != nil { return y.Wrapf(err, "Error while creating log file %d in valueLog.open", newid) } last, ok = vlog.filesMap[newid] y.AssertTrue(ok) } lastOffset, err := last.fd.Seek(0, io.SeekEnd) if err != nil { return errFile(err, last.path, "file.Seek to end") } vlog.writableLogOffset = uint32(lastOffset) // Update the head to point to the updated tail. Otherwise, even after doing a successful // replay and closing the DB, the value log head does not get updated, which causes the replay // to happen repeatedly. vlog.db.vhead = valuePointer{Fid: vlog.maxFid, Offset: uint32(lastOffset)} // Map the file if needed. When we create a file, it is automatically mapped. if err = last.mmap(2 * vlog.opt.ValueLogFileSize); err != nil { return errFile(err, last.path, "Map log file") } if err := vlog.populateDiscardStats(); err != nil { // Print the error and continue. We don't want to prevent value log open if there's an error // with the fetching discards stats. db.opt.Errorf("Failed to populate discard stats: %s", err) } return nil } func (lf *logFile) init() error { fstat, err := lf.fd.Stat() if err != nil { return errors.Wrapf(err, "Unable to check stat for %q", lf.path) } sz := fstat.Size() if sz == 0 { // File is empty. We don't need to mmap it. Return. return nil } y.AssertTrue(sz <= math.MaxUint32) lf.size = uint32(sz) if err = lf.mmap(sz); err != nil { _ = lf.fd.Close() return errors.Wrapf(err, "Unable to map file: %q", fstat.Name()) } return nil } func (vlog *valueLog) stopFlushDiscardStats() { if vlog.lfDiscardStats != nil { vlog.lfDiscardStats.closer.Signal() } } func (vlog *valueLog) Close() error { if vlog == nil || vlog.db == nil || vlog.db.opt.InMemory { return nil } // close flushDiscardStats. vlog.lfDiscardStats.closer.SignalAndWait() vlog.opt.Debugf("Stopping garbage collection of values.") var err error for id, f := range vlog.filesMap { f.lock.Lock() // We won’t release the lock. if munmapErr := f.munmap(); munmapErr != nil && err == nil { err = munmapErr } maxFid := vlog.maxFid // TODO(ibrahim) - Do we need the following truncations on non-windows // platforms? We expand the file only on windows and the vlog.woffset() // should point to end of file on all other platforms. if !vlog.opt.ReadOnly && id == maxFid { // truncate writable log file to correct offset. if truncErr := f.fd.Truncate( int64(vlog.woffset())); truncErr != nil && err == nil { err = truncErr } } if closeErr := f.fd.Close(); closeErr != nil && err == nil { err = closeErr } } return err } // sortedFids returns the file id's not pending deletion, sorted. Assumes we have shared access to // filesMap. 
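// Both replay in valueLog.open and GC candidate selection in pickLog depend on this
// ascending order.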
func (vlog *valueLog) sortedFids() []uint32 { toBeDeleted := make(map[uint32]struct{}) for _, fid := range vlog.filesToBeDeleted { toBeDeleted[fid] = struct{}{} } ret := make([]uint32, 0, len(vlog.filesMap)) for fid := range vlog.filesMap { if _, ok := toBeDeleted[fid]; !ok { ret = append(ret, fid) } } sort.Slice(ret, func(i, j int) bool { return ret[i] < ret[j] }) return ret } type request struct { // Input values Entries []*Entry // Output values and wait group stuff below Ptrs []valuePointer Wg sync.WaitGroup Err error ref int32 } func (req *request) reset() { req.Entries = req.Entries[:0] req.Ptrs = req.Ptrs[:0] req.Wg = sync.WaitGroup{} req.Err = nil req.ref = 0 } func (req *request) IncrRef() { atomic.AddInt32(&req.ref, 1) } func (req *request) DecrRef() { nRef := atomic.AddInt32(&req.ref, -1) if nRef > 0 { return } req.Entries = nil requestPool.Put(req) } func (req *request) Wait() error { req.Wg.Wait() err := req.Err req.DecrRef() // DecrRef after writing to DB. return err } type requests []*request func (reqs requests) DecrRef() { for _, req := range reqs { req.DecrRef() } } func (reqs requests) IncrRef() { for _, req := range reqs { req.IncrRef() } } // sync function syncs content of latest value log file to disk. Syncing of value log directory is // not required here as it happens every time a value log file rotation happens(check createVlogFile // function). During rotation, previous value log file also gets synced to disk. It only syncs file // if fid >= vlog.maxFid. In some cases such as replay(while opening db), it might be called with // fid < vlog.maxFid. To sync irrespective of file id just call it with math.MaxUint32. func (vlog *valueLog) sync(fid uint32) error { if vlog.opt.SyncWrites || vlog.opt.InMemory { return nil } vlog.filesLock.RLock() maxFid := vlog.maxFid // During replay it is possible to get sync call with fid less than maxFid. // Because older file has already been synced, we can return from here. if fid < maxFid || len(vlog.filesMap) == 0 { vlog.filesLock.RUnlock() return nil } curlf := vlog.filesMap[maxFid] // Sometimes it is possible that vlog.maxFid has been increased but file creation // with same id is still in progress and this function is called. In those cases // entry for the file might not be present in vlog.filesMap. if curlf == nil { vlog.filesLock.RUnlock() return nil } curlf.lock.RLock() vlog.filesLock.RUnlock() err := curlf.sync() curlf.lock.RUnlock() return err } func (vlog *valueLog) woffset() uint32 { return atomic.LoadUint32(&vlog.writableLogOffset) } // validateWrites will check whether the given requests can fit into 4GB vlog file. // NOTE: 4GB is the maximum size we can create for vlog because value pointer offset is of type // uint32. If we create more than 4GB, it will overflow uint32. So, limiting the size to 4GB. func (vlog *valueLog) validateWrites(reqs []*request) error { vlogOffset := uint64(vlog.woffset()) for _, req := range reqs { // calculate size of the request. size := estimateRequestSize(req) estimatedVlogOffset := vlogOffset + size if estimatedVlogOffset > uint64(maxVlogFileSize) { return errors.Errorf("Request size offset %d is bigger than maximum offset %d", estimatedVlogOffset, maxVlogFileSize) } if estimatedVlogOffset >= uint64(vlog.opt.ValueLogFileSize) { // We'll create a new vlog file if the estimated offset is greater or equal to // max vlog size. So, resetting the vlogOffset. vlogOffset = 0 continue } // Estimated vlog offset will become current vlog offset if the vlog is not rotated. 
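// For example, with a 1GB ValueLogFileSize, a request whose entries would spill past the
// 1GB mark is simply accounted against a fresh file starting at offset 0; only the hard
// 4GB uint32 limit can actually fail validation.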
vlogOffset = estimatedVlogOffset } return nil } // estimateRequestSize returns the size that needs to be written for the given request. func estimateRequestSize(req *request) uint64 { size := uint64(0) for _, e := range req.Entries { size += uint64(maxHeaderSize + len(e.Key) + len(e.Value) + crc32.Size) } return size } // write is thread-unsafe by design and should not be called concurrently. func (vlog *valueLog) write(reqs []*request) error { if vlog.db.opt.InMemory { return nil } // Validate writes before writing to vlog, because we don't want to partially write and then return // an error. if err := vlog.validateWrites(reqs); err != nil { return err } vlog.filesLock.RLock() maxFid := vlog.maxFid curlf := vlog.filesMap[maxFid] vlog.filesLock.RUnlock() var buf bytes.Buffer flushWrites := func() error { if buf.Len() == 0 { return nil } vlog.opt.Debugf("Flushing buffer of size %d to vlog", buf.Len()) n, err := curlf.fd.Write(buf.Bytes()) if err != nil { return errors.Wrapf(err, "Unable to write to value log file: %q", curlf.path) } buf.Reset() y.NumWrites.Add(1) y.NumBytesWritten.Add(int64(n)) vlog.opt.Debugf("Done") atomic.AddUint32(&vlog.writableLogOffset, uint32(n)) atomic.StoreUint32(&curlf.size, vlog.writableLogOffset) return nil } toDisk := func() error { if err := flushWrites(); err != nil { return err } if vlog.woffset() > uint32(vlog.opt.ValueLogFileSize) || vlog.numEntriesWritten > vlog.opt.ValueLogMaxEntries { if err := curlf.doneWriting(vlog.woffset()); err != nil { return err } newid := vlog.maxFid + 1 y.AssertTruef(newid > 0, "newid has overflown uint32: %v", newid) newlf, err := vlog.createVlogFile(newid) if err != nil { return err } curlf = newlf atomic.AddInt32(&vlog.db.logRotates, 1) } return nil } for i := range reqs { b := reqs[i] b.Ptrs = b.Ptrs[:0] var written int for j := range b.Entries { e := b.Entries[j] if e.skipVlog { b.Ptrs = append(b.Ptrs, valuePointer{}) continue } var p valuePointer p.Fid = curlf.fid // Use the offset including buffer length so far. p.Offset = vlog.woffset() + uint32(buf.Len()) plen, err := curlf.encodeEntry(e, &buf, p.Offset) // Now encode the entry into buffer. if err != nil { return err } p.Len = uint32(plen) b.Ptrs = append(b.Ptrs, p) written++ // It is possible that the size of the buffer grows beyond the max size of the value // log (this happens when a transaction contains entries with large value sizes) and // badger might run into out of memory errors. We flush the buffer here if its size // grows beyond the max value log size. if int64(buf.Len()) > vlog.db.opt.ValueLogFileSize { if err := flushWrites(); err != nil { return err } } } vlog.numEntriesWritten += uint32(written) // We write to disk here so that all entries that are part of the same transaction are // written to the same vlog file. writeNow := vlog.woffset()+uint32(buf.Len()) > uint32(vlog.opt.ValueLogFileSize) || vlog.numEntriesWritten > uint32(vlog.opt.ValueLogMaxEntries) if writeNow { if err := toDisk(); err != nil { return err } } } return toDisk() } // Gets the logFile and acquires an RLock() for the mmap. You must call RUnlock on the file // (if non-nil). func (vlog *valueLog) getFileRLocked(vp valuePointer) (*logFile, error) { vlog.filesLock.RLock() defer vlog.filesLock.RUnlock() ret, ok := vlog.filesMap[vp.Fid] if !ok { // The log file has gone away; the caller will need to retry the operation. return nil, ErrRetry } // Check for valid offset if we are reading from the writable log.
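// Offsets at or beyond vlog.woffset() are still sitting in the write buffer and have not reached the file yet, so a pointer into that region cannot be served.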
maxFid := vlog.maxFid if vp.Fid == maxFid { currentOffset := vlog.woffset() if vp.Offset >= currentOffset { return nil, errors.Errorf( "Invalid value pointer offset: %d greater than current offset: %d", vp.Offset, currentOffset) } } ret.lock.RLock() return ret, nil } // Read reads the value log at a given location. // TODO: Make this read private. func (vlog *valueLog) Read(vp valuePointer, s *y.Slice) ([]byte, func(), error) { buf, lf, err := vlog.readValueBytes(vp, s) // The log file is locked, so decide whether to unlock it immediately or let the // caller unlock it after use. cb := vlog.getUnlockCallback(lf) if err != nil { return nil, cb, err } if vlog.opt.VerifyValueChecksum { hash := crc32.New(y.CastagnoliCrcTable) if _, err := hash.Write(buf[:len(buf)-crc32.Size]); err != nil { runCallback(cb) return nil, nil, errors.Wrapf(err, "failed to write hash for vp %+v", vp) } // Fetch checksum from the end of the buffer. checksum := buf[len(buf)-crc32.Size:] if hash.Sum32() != y.BytesToU32(checksum) { runCallback(cb) return nil, nil, errors.Wrapf(y.ErrChecksumMismatch, "value corrupted for vp: %+v", vp) } } var h header headerLen := h.Decode(buf) kv := buf[headerLen:] if lf.encryptionEnabled() { kv, err = lf.decryptKV(kv, vp.Offset) if err != nil { return nil, cb, err } } if uint32(len(kv)) < h.klen+h.vlen { vlog.db.opt.Logger.Errorf("Invalid read: vp: %+v", vp) return nil, nil, errors.Errorf("Invalid read: Len: %d read at:[%d:%d]", len(kv), h.klen, h.klen+h.vlen) } return kv[h.klen : h.klen+h.vlen], cb, nil } // getUnlockCallback returns a function which unlocks the logfile if the logfile is mmapped. // Otherwise, it unlocks the logfile immediately and returns nil. func (vlog *valueLog) getUnlockCallback(lf *logFile) func() { if lf == nil { return nil } if vlog.opt.ValueLogLoadingMode == options.MemoryMap { return lf.lock.RUnlock } lf.lock.RUnlock() return nil } // readValueBytes returns the vlog entry slice and the read-locked log file. The caller should take care // of unlocking the logFile. func (vlog *valueLog) readValueBytes(vp valuePointer, s *y.Slice) ([]byte, *logFile, error) { lf, err := vlog.getFileRLocked(vp) if err != nil { return nil, nil, err } buf, err := lf.read(vp, s) return buf, lf, err } func (vlog *valueLog) pickLog(head valuePointer, tr trace.Trace) (files []*logFile) { vlog.filesLock.RLock() defer vlog.filesLock.RUnlock() fids := vlog.sortedFids() switch { case len(fids) <= 1: tr.LazyPrintf("Only one or less value log file.") return nil case head.Fid == 0: tr.LazyPrintf("Head pointer is at zero.") return nil } // Pick a candidate that contains the largest amount of discardable data. candidate := struct { fid uint32 discard int64 }{math.MaxUint32, 0} vlog.lfDiscardStats.RLock() for _, fid := range fids { if fid >= head.Fid { break } if vlog.lfDiscardStats.m[fid] > candidate.discard { candidate.fid = fid candidate.discard = vlog.lfDiscardStats.m[fid] } } vlog.lfDiscardStats.RUnlock() if candidate.fid != math.MaxUint32 { // Found a candidate. tr.LazyPrintf("Found candidate via discard stats: %v", candidate) files = append(files, vlog.filesMap[candidate.fid]) } else { tr.LazyPrintf("Could not find candidate via discard stats. Randomly picking one.") } // Fall back to randomly picking a log file. var idxHead int for i, fid := range fids { if fid == head.Fid { idxHead = i break } } if idxHead == 0 { // Not found or first file. tr.LazyPrintf("Could not find any file.") return nil } idx := rand.Intn(idxHead) // Don’t include head.Fid. We pick a random file before it.
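// A second draw below further skews the pick toward lower indices, i.e. older value log files, which are more likely to hold discardable data.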
if idx > 0 { idx = rand.Intn(idx + 1) // Another level of rand to favor smaller fids. } tr.LazyPrintf("Randomly chose fid: %d", fids[idx]) files = append(files, vlog.filesMap[fids[idx]]) return files } func discardEntry(e Entry, vs y.ValueStruct, db *DB) bool { if vs.Version != y.ParseTs(e.Key) { // Version not found. Discard. return true } if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { return true } if (vs.Meta & bitValuePointer) == 0 { // The LSM tree already stores the value along with the key. Discard. return true } if (vs.Meta & bitFinTxn) > 0 { // Just a txn finish entry. Discard. return true } if bytes.HasPrefix(e.Key, badgerMove) { // Verify that the actual key entry, without the badgerMove prefix, has not been deleted. // If this is not done the badgerMove entry will be kept forever, moving from // vlog to vlog during rewrites. avs, err := db.get(e.Key[len(badgerMove):]) if err != nil { return false } return avs.Version == 0 } return false } func (vlog *valueLog) doRunGC(lf *logFile, discardRatio float64, tr trace.Trace) (err error) { // Update stats before exiting. defer func() { if err == nil { vlog.lfDiscardStats.Lock() delete(vlog.lfDiscardStats.m, lf.fid) vlog.lfDiscardStats.Unlock() } }() type reason struct { total float64 discard float64 count int } fi, err := lf.fd.Stat() if err != nil { tr.LazyPrintf("Error while finding file size: %v", err) tr.SetError() return err } // Set up the sampling window sizes. sizeWindow := float64(fi.Size()) * 0.1 // 10% of the file as window. sizeWindowM := sizeWindow / (1 << 20) // in MBs. countWindow := int(float64(vlog.opt.ValueLogMaxEntries) * 0.01) // 1% of num entries. tr.LazyPrintf("Size window: %5.2f. Count window: %d.", sizeWindow, countWindow) // Pick a random start point for the log. skipFirstM := float64(rand.Int63n(fi.Size())) // Pick a random starting location. skipFirstM -= sizeWindow // Avoid hitting EOF by moving back by window. skipFirstM /= float64(mi) // Convert to MBs. tr.LazyPrintf("Skip first %5.2f MB of file of size: %d MB", skipFirstM, fi.Size()/mi) var skipped float64 var r reason start := time.Now() y.AssertTrue(vlog.db != nil) s := new(y.Slice) var numIterations int _, err = vlog.iterate(lf, 0, func(e Entry, vp valuePointer) error { numIterations++ esz := float64(vp.Len) / (1 << 20) // in MBs. if skipped < skipFirstM { skipped += esz return nil } // Sample until we reach the window sizes or exceed 10 seconds. if r.count > countWindow { tr.LazyPrintf("Stopping sampling after %d entries.", countWindow) return errStop } if r.total > sizeWindowM { tr.LazyPrintf("Stopping sampling after reaching window size.") return errStop } if time.Since(start) > 10*time.Second { tr.LazyPrintf("Stopping sampling after 10 seconds.") return errStop } r.total += esz r.count++ vs, err := vlog.db.get(e.Key) if err != nil { return err } if discardEntry(e, vs, vlog.db) { r.discard += esz return nil } // Value is still present in value log. y.AssertTrue(len(vs.Value) > 0) vp.Decode(vs.Value) if vp.Fid > lf.fid { // Value is present in a later log. Discard. r.discard += esz return nil } if vp.Offset > e.offset { // Value is present in a later offset, but in the same log. r.discard += esz return nil } if vp.Fid == lf.fid && vp.Offset == e.offset { // This is still the active entry. This would need to be rewritten. } else { vlog.opt.Debugf("Reason=%+v\n", r) buf, lf, err := vlog.readValueBytes(vp, s) // We need to decide whether to unlock the log file immediately, based on the // loading mode. getUnlockCallback will take care of it.
cb := vlog.getUnlockCallback(lf) if err != nil { runCallback(cb) return errStop } ne, err := lf.decodeEntry(buf, vp.Offset) if err != nil { runCallback(cb) return errStop } ne.print("Latest Entry Header in LSM") e.print("Latest Entry in Log") runCallback(cb) return errors.Errorf("This shouldn't happen. Latest Pointer:%+v. Meta:%v.", vp, vs.Meta) } return nil }) if err != nil { tr.LazyPrintf("Error while iterating for RunGC: %v", err) tr.SetError() return err } tr.LazyPrintf("Fid: %d. Skipped: %5.2fMB Num iterations: %d. Data status=%+v\n", lf.fid, skipped, numIterations, r) // If we couldn't sample at least 1000 KV pairs or at least 75% of the window size, // and what we can discard is below the threshold, we should skip the rewrite. if (r.count < countWindow && r.total < sizeWindowM*0.75) || r.discard < discardRatio*r.total { tr.LazyPrintf("Skipping GC on fid: %d", lf.fid) return ErrNoRewrite } if err = vlog.rewrite(lf, tr); err != nil { return err } tr.LazyPrintf("Done rewriting.") return nil } func (vlog *valueLog) waitOnGC(lc *y.Closer) { defer lc.Done() <-lc.HasBeenClosed() // Wait for lc to be closed. // Block until any GC in progress finishes, and don't allow any more runGC calls by filling up // the channel of size 1. vlog.garbageCh <- struct{}{} } func (vlog *valueLog) runGC(discardRatio float64, head valuePointer) error { select { case vlog.garbageCh <- struct{}{}: // Pick a log file for GC. tr := trace.New("Badger.ValueLog", "GC") tr.SetMaxEvents(100) defer func() { tr.Finish() <-vlog.garbageCh }() var err error files := vlog.pickLog(head, tr) if len(files) == 0 { tr.LazyPrintf("PickLog returned zero results.") return ErrNoRewrite } tried := make(map[uint32]bool) for _, lf := range files { if _, done := tried[lf.fid]; done { continue } tried[lf.fid] = true err = vlog.doRunGC(lf, discardRatio, tr) if err == nil { return vlog.deleteMoveKeysFor(lf.fid, tr) } } return err default: return ErrRejected } } func (vlog *valueLog) updateDiscardStats(stats map[uint32]int64) { if vlog.opt.InMemory { return } select { case vlog.lfDiscardStats.flushChan <- stats: default: vlog.opt.Warningf("updateDiscardStats called: discard stats flushChan full, " + "returning without pushing to flushChan") } } func (vlog *valueLog) flushDiscardStats() { defer vlog.lfDiscardStats.closer.Done() mergeStats := func(stats map[uint32]int64) ([]byte, error) { vlog.lfDiscardStats.Lock() defer vlog.lfDiscardStats.Unlock() for fid, count := range stats { vlog.lfDiscardStats.m[fid] += count vlog.lfDiscardStats.updatesSinceFlush++ } if vlog.lfDiscardStats.updatesSinceFlush > discardStatsFlushThreshold { encodedDS, err := json.Marshal(vlog.lfDiscardStats.m) if err != nil { return nil, err } vlog.lfDiscardStats.updatesSinceFlush = 0 return encodedDS, nil } return nil, nil } process := func(stats map[uint32]int64) error { encodedDS, err := mergeStats(stats) if err != nil || encodedDS == nil { return err } entries := []*Entry{{ Key: y.KeyWithTs(lfDiscardStatsKey, 1), Value: encodedDS, }} req, err := vlog.db.sendToWriteCh(entries) // No special handling of ErrBlockedWrites is required as err is just logged in the // for loop below. if err != nil { return errors.Wrapf(err, "failed to push discard stats to write channel") } return req.Wait() } closer := vlog.lfDiscardStats.closer for { select { case <-closer.HasBeenClosed(): // For simplicity, just return without processing the stats already present in flushChan.
return case stats := <-vlog.lfDiscardStats.flushChan: if err := process(stats); err != nil { vlog.opt.Errorf("unable to process discardstats with error: %s", err) } } } } // populateDiscardStats populates vlog.lfDiscardStats. // This function will be called while initializing valueLog. func (vlog *valueLog) populateDiscardStats() error { key := y.KeyWithTs(lfDiscardStatsKey, math.MaxUint64) var statsMap map[uint32]int64 var val []byte var vp valuePointer for { vs, err := vlog.db.get(key) if err != nil { return err } // Value doesn't exist. if vs.Meta == 0 && len(vs.Value) == 0 { vlog.opt.Debugf("Value log discard stats empty") return nil } vp.Decode(vs.Value) // Entry stored in LSM tree. if vs.Meta&bitValuePointer == 0 { val = y.SafeCopy(val, vs.Value) break } // Read entry from value log. result, cb, err := vlog.Read(vp, new(y.Slice)) runCallback(cb) val = y.SafeCopy(val, result) // The result is stored in val. We can break the loop from here. if err == nil { break } if err != ErrRetry { return err } // If we're at this point it means we haven't found the value yet and if the current key has // badger move prefix, we should break from here since we've already tried the original key // and the key with move prefix. "val" would be empty since we haven't found the value yet. if bytes.HasPrefix(key, badgerMove) { break } // If we're at this point it means the discard stats key was moved by the GC and the actual // entry is the one prefixed by badger move key. // Prepend existing key with badger move and search for the key. key = append(badgerMove, key...) } if len(val) == 0 { return nil } if err := json.Unmarshal(val, &statsMap); err != nil { return errors.Wrapf(err, "failed to unmarshal discard stats") } vlog.opt.Debugf("Value Log Discard stats: %v", statsMap) vlog.lfDiscardStats.flushChan <- statsMap return nil } badger-2.2007.2/value_test.go000066400000000000000000001047771372173116500157200ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package badger import ( "bytes" "encoding/json" "fmt" "io/ioutil" "math" "math/rand" "os" "reflect" "runtime" "sync" "testing" "time" "github.com/dgraph-io/badger/v2/options" "github.com/dgraph-io/badger/v2/y" humanize "github.com/dustin/go-humanize" "github.com/stretchr/testify/require" "golang.org/x/net/trace" ) func TestValueBasic(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") y.Check(err) defer removeDir(dir) kv, _ := Open(getTestOptions(dir).WithValueThreshold(32)) defer kv.Close() log := &kv.vlog // Use value big enough that the value log writes them even if SyncWrites is false. 
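// (With WithValueThreshold(32) set above, any value of at least 32 bytes goes to the value log and the LSM tree keeps only a valuePointer; both sample values below cross that threshold.)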
const val1 = "sampleval012345678901234567890123" const val2 = "samplevalb012345678901234567890123" require.True(t, len(val1) >= kv.opt.ValueThreshold) e1 := &Entry{ Key: []byte("samplekey"), Value: []byte(val1), meta: bitValuePointer, } e2 := &Entry{ Key: []byte("samplekeyb"), Value: []byte(val2), meta: bitValuePointer, } b := new(request) b.Entries = []*Entry{e1, e2} log.write([]*request{b}) require.Len(t, b.Ptrs, 2) t.Logf("Pointer written: %+v %+v\n", b.Ptrs[0], b.Ptrs[1]) s := new(y.Slice) buf1, lf1, err1 := log.readValueBytes(b.Ptrs[0], s) buf2, lf2, err2 := log.readValueBytes(b.Ptrs[1], s) require.NoError(t, err1) require.NoError(t, err2) defer runCallback(log.getUnlockCallback(lf1)) defer runCallback(log.getUnlockCallback(lf2)) e1, err = lf1.decodeEntry(buf1, b.Ptrs[0].Offset) require.NoError(t, err) e2, err = lf1.decodeEntry(buf2, b.Ptrs[1].Offset) require.NoError(t, err) readEntries := []Entry{*e1, *e2} require.EqualValues(t, []Entry{ { Key: []byte("samplekey"), Value: []byte(val1), meta: bitValuePointer, offset: b.Ptrs[0].Offset, }, { Key: []byte("samplekeyb"), Value: []byte(val2), meta: bitValuePointer, offset: b.Ptrs[1].Offset, }, }, readEntries) } func TestValueGCManaged(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) N := 10000 opt := getTestOptions(dir) opt.ValueLogMaxEntries = uint32(N / 10) opt.managedTxns = true db, err := Open(opt) require.NoError(t, err) defer db.Close() var ts uint64 newTs := func() uint64 { ts++ return ts } sz := 64 << 10 var wg sync.WaitGroup for i := 0; i < N; i++ { v := make([]byte, sz) rand.Read(v[:rand.Intn(sz)]) wg.Add(1) txn := db.NewTransactionAt(newTs(), true) require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v))) require.NoError(t, txn.CommitAt(newTs(), func(err error) { wg.Done() require.NoError(t, err) })) } for i := 0; i < N; i++ { wg.Add(1) txn := db.NewTransactionAt(newTs(), true) require.NoError(t, txn.Delete([]byte(fmt.Sprintf("key%d", i)))) require.NoError(t, txn.CommitAt(newTs(), func(err error) { wg.Done() require.NoError(t, err) })) } wg.Wait() files, err := ioutil.ReadDir(dir) require.NoError(t, err) for _, fi := range files { t.Logf("File: %s. Size: %s\n", fi.Name(), humanize.Bytes(uint64(fi.Size()))) } for i := 0; i < 100; i++ { // Try at max 100 times to GC even a single value log file. 
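// RunValueLogGC rewrites a file only when the sampled discard ratio exceeds its argument, so 0.0001 is deliberately tiny: almost any discardable data should trigger a rewrite.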
if err := db.RunValueLogGC(0.0001); err == nil { return // Done } } require.Fail(t, "Unable to GC even a single value log file.") } func TestValueGC(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueLogFileSize = 1 << 20 kv, _ := Open(opt) defer kv.Close() sz := 32 << 10 txn := kv.NewTransaction(true) for i := 0; i < 100; i++ { v := make([]byte, sz) rand.Read(v[:rand.Intn(sz)]) require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v))) if i%20 == 0 { require.NoError(t, txn.Commit()) txn = kv.NewTransaction(true) } } require.NoError(t, txn.Commit()) for i := 0; i < 45; i++ { txnDelete(t, kv, []byte(fmt.Sprintf("key%d", i))) } kv.vlog.filesLock.RLock() lf := kv.vlog.filesMap[kv.vlog.sortedFids()[0]] kv.vlog.filesLock.RUnlock() // lf.iterate(0, func(e Entry) bool { // e.print("lf") // return true // }) tr := trace.New("Test", "Test") defer tr.Finish() kv.vlog.rewrite(lf, tr) for i := 45; i < 100; i++ { key := []byte(fmt.Sprintf("key%d", i)) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get(key) require.NoError(t, err) val := getItemValue(t, item) require.NotNil(t, val) require.True(t, len(val) == sz, "Size found: %d", len(val)) return nil })) } } func TestValueGC2(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueLogFileSize = 1 << 20 kv, _ := Open(opt) defer kv.Close() sz := 32 << 10 txn := kv.NewTransaction(true) for i := 0; i < 100; i++ { v := make([]byte, sz) rand.Read(v[:rand.Intn(sz)]) require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v))) if i%20 == 0 { require.NoError(t, txn.Commit()) txn = kv.NewTransaction(true) } } require.NoError(t, txn.Commit()) for i := 0; i < 5; i++ { txnDelete(t, kv, []byte(fmt.Sprintf("key%d", i))) } for i := 5; i < 10; i++ { v := []byte(fmt.Sprintf("value%d", i)) txnSet(t, kv, []byte(fmt.Sprintf("key%d", i)), v, 0) } kv.vlog.filesLock.RLock() lf := kv.vlog.filesMap[kv.vlog.sortedFids()[0]] kv.vlog.filesLock.RUnlock() // lf.iterate(0, func(e Entry) bool { // e.print("lf") // return true // }) tr := trace.New("Test", "Test") defer tr.Finish() kv.vlog.rewrite(lf, tr) for i := 0; i < 5; i++ { key := []byte(fmt.Sprintf("key%d", i)) require.NoError(t, kv.View(func(txn *Txn) error { _, err := txn.Get(key) require.Equal(t, ErrKeyNotFound, err) return nil })) } for i := 5; i < 10; i++ { key := []byte(fmt.Sprintf("key%d", i)) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get(key) require.NoError(t, err) val := getItemValue(t, item) require.NotNil(t, val) require.Equal(t, string(val), fmt.Sprintf("value%d", i)) return nil })) } for i := 10; i < 100; i++ { key := []byte(fmt.Sprintf("key%d", i)) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get(key) require.NoError(t, err) val := getItemValue(t, item) require.NotNil(t, val) require.True(t, len(val) == sz, "Size found: %d", len(val)) return nil })) } } func TestValueGC3(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueLogFileSize = 1 << 20 kv, err := Open(opt) require.NoError(t, err) defer kv.Close() // We want to test whether an iterator can continue through a value log GC. valueSize := 32 << 10 var value3 []byte txn := kv.NewTransaction(true) for i := 0; i < 100; i++ { v := make([]byte, valueSize) // 32K * 100 will take >=3'276'800 B. 
if i == 3 { value3 = v } rand.Read(v[:]) // Keys key000, key001, key002, such that sorted order matches insertion order require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%03d", i)), v))) if i%20 == 0 { require.NoError(t, txn.Commit()) txn = kv.NewTransaction(true) } } require.NoError(t, txn.Commit()) // Start an iterator to keys in the first value log file itOpt := IteratorOptions{ PrefetchValues: false, PrefetchSize: 0, Reverse: false, } txn = kv.NewTransaction(true) it := txn.NewIterator(itOpt) defer it.Close() // Walk a few keys it.Rewind() require.True(t, it.Valid()) item := it.Item() require.Equal(t, []byte("key000"), item.Key()) it.Next() require.True(t, it.Valid()) item = it.Item() require.Equal(t, []byte("key001"), item.Key()) it.Next() require.True(t, it.Valid()) item = it.Item() require.Equal(t, []byte("key002"), item.Key()) // Like other tests, we pull out a logFile to rewrite it directly kv.vlog.filesLock.RLock() logFile := kv.vlog.filesMap[kv.vlog.sortedFids()[0]] kv.vlog.filesLock.RUnlock() tr := trace.New("Test", "Test") defer tr.Finish() kv.vlog.rewrite(logFile, tr) it.Next() require.True(t, it.Valid()) item = it.Item() require.Equal(t, []byte("key003"), item.Key()) v3, err := item.ValueCopy(nil) require.NoError(t, err) require.Equal(t, value3, v3) } func TestValueGC4(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueLogFileSize = 1 << 20 opt.Truncate = true kv, err := Open(opt) require.NoError(t, err) sz := 128 << 10 // 5 entries per value log file. txn := kv.NewTransaction(true) for i := 0; i < 24; i++ { v := make([]byte, sz) rand.Read(v[:rand.Intn(sz)]) require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v))) if i%3 == 0 { require.NoError(t, txn.Commit()) txn = kv.NewTransaction(true) } } require.NoError(t, txn.Commit()) for i := 0; i < 8; i++ { txnDelete(t, kv, []byte(fmt.Sprintf("key%d", i))) } for i := 8; i < 16; i++ { v := []byte(fmt.Sprintf("value%d", i)) txnSet(t, kv, []byte(fmt.Sprintf("key%d", i)), v, 0) } kv.vlog.filesLock.RLock() lf0 := kv.vlog.filesMap[kv.vlog.sortedFids()[0]] lf1 := kv.vlog.filesMap[kv.vlog.sortedFids()[1]] kv.vlog.filesLock.RUnlock() // lf.iterate(0, func(e Entry) bool { // e.print("lf") // return true // }) tr := trace.New("Test", "Test") defer tr.Finish() kv.vlog.rewrite(lf0, tr) kv.vlog.rewrite(lf1, tr) require.NoError(t, kv.Close()) kv, err = Open(opt) require.NoError(t, err) for i := 0; i < 8; i++ { key := []byte(fmt.Sprintf("key%d", i)) require.NoError(t, kv.View(func(txn *Txn) error { _, err := txn.Get(key) require.Equal(t, ErrKeyNotFound, err) return nil })) } for i := 8; i < 16; i++ { key := []byte(fmt.Sprintf("key%d", i)) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get(key) require.NoError(t, err) val := getItemValue(t, item) require.NotNil(t, val) require.Equal(t, string(val), fmt.Sprintf("value%d", i)) return nil })) } require.NoError(t, kv.Close()) } func TestPersistLFDiscardStats(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueLogFileSize = 1 << 20 opt.Truncate = true // avoid compaction on close, so that discard map remains same opt.CompactL0OnClose = false db, err := Open(opt) require.NoError(t, err) sz := 128 << 10 // 5 entries per value log file. 
v := make([]byte, sz) rand.Read(v[:rand.Intn(sz)]) txn := db.NewTransaction(true) for i := 0; i < 500; i++ { require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v))) if i%3 == 0 { require.NoError(t, txn.Commit()) txn = db.NewTransaction(true) } } require.NoError(t, txn.Commit(), "error while committing txn") for i := 0; i < 500; i++ { // use Entry.WithDiscard() to delete entries, because this causes data to be flushed on // disk, creating SSTs. Simple Delete was having data in Memtables only. err = db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v).WithDiscard()) }) require.NoError(t, err) } time.Sleep(2 * time.Second) // wait for compaction to complete persistedMap := make(map[uint32]int64) db.vlog.lfDiscardStats.Lock() require.True(t, len(db.vlog.lfDiscardStats.m) > 0, "some discardStats should be generated") for k, v := range db.vlog.lfDiscardStats.m { persistedMap[k] = v } db.vlog.lfDiscardStats.updatesSinceFlush = discardStatsFlushThreshold + 1 db.vlog.lfDiscardStats.Unlock() // db.vlog.lfDiscardStats.updatesSinceFlush is already > discardStatsFlushThreshold, // send empty map to flushChan, so that latest discardStats map can be persisted. db.vlog.lfDiscardStats.flushChan <- map[uint32]int64{} time.Sleep(1 * time.Second) // Wait for map to be persisted. err = db.Close() require.NoError(t, err) // Avoid running compactors on reopening badger. opt.NumCompactors = 0 db, err = Open(opt) require.NoError(t, err) defer db.Close() time.Sleep(1 * time.Second) // Wait for discardStats to be populated by populateDiscardStats(). db.vlog.lfDiscardStats.RLock() require.True(t, reflect.DeepEqual(persistedMap, db.vlog.lfDiscardStats.m), "Discard maps are not equal") db.vlog.lfDiscardStats.RUnlock() } func TestChecksums(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) // Set up SST with K1=V1 opts := getTestOptions(dir) opts.Truncate = true opts.ValueLogFileSize = 100 * 1024 * 1024 // 100Mb opts.ValueThreshold = 32 kv, err := Open(opts) require.NoError(t, err) require.NoError(t, kv.Close()) var ( k0 = []byte("k0") k1 = []byte("k1") k2 = []byte("k2") k3 = []byte("k3") v0 = []byte("value0-012345678901234567890123012345678901234567890123") v1 = []byte("value1-012345678901234567890123012345678901234567890123") v2 = []byte("value2-012345678901234567890123012345678901234567890123") v3 = []byte("value3-012345678901234567890123012345678901234567890123") ) // Make sure the value log would actually store the item require.True(t, len(v0) >= kv.opt.ValueThreshold) // Use a vlog with K0=V0 and a (corrupted) second transaction(k1,k2) buf := createVlog(t, []*Entry{ {Key: k0, Value: v0}, {Key: k1, Value: v1}, {Key: k2, Value: v2}, }) buf[len(buf)-1]++ // Corrupt last byte require.NoError(t, ioutil.WriteFile(vlogFilePath(dir, 0), buf, 0777)) // K1 should exist, but K2 shouldn't. kv, err = Open(opts) require.NoError(t, err) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get(k0) require.NoError(t, err) require.Equal(t, getItemValue(t, item), v0) _, err = txn.Get(k1) require.Equal(t, ErrKeyNotFound, err) _, err = txn.Get(k2) require.Equal(t, ErrKeyNotFound, err) return nil })) // Write K3 at the end of the vlog. txnSet(t, kv, k3, v3, 0) require.NoError(t, kv.Close()) // The vlog should contain K0 and K3 (K1 and k2 was lost when Badger started up // last due to checksum failure). 
kv, err = Open(opts) require.NoError(t, err) { txn := kv.NewTransaction(false) iter := txn.NewIterator(DefaultIteratorOptions) iter.Seek(k0) require.True(t, iter.Valid()) it := iter.Item() require.Equal(t, it.Key(), k0) require.Equal(t, getItemValue(t, it), v0) iter.Next() require.True(t, iter.Valid()) it = iter.Item() require.Equal(t, it.Key(), k3) require.Equal(t, getItemValue(t, it), v3) iter.Close() txn.Discard() } require.NoError(t, kv.Close()) } func TestPartialAppendToValueLog(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) // Create skeleton files. opts := getTestOptions(dir) opts.Truncate = true opts.ValueLogFileSize = 100 * 1024 * 1024 // 100Mb opts.ValueThreshold = 32 kv, err := Open(opts) require.NoError(t, err) require.NoError(t, kv.Close()) var ( k0 = []byte("k0") k1 = []byte("k1") k2 = []byte("k2") k3 = []byte("k3") v0 = []byte("value0-01234567890123456789012012345678901234567890123") v1 = []byte("value1-01234567890123456789012012345678901234567890123") v2 = []byte("value2-01234567890123456789012012345678901234567890123") v3 = []byte("value3-01234567890123456789012012345678901234567890123") ) // Values need to be long enough to actually get written to value log. require.True(t, len(v3) >= kv.opt.ValueThreshold) // Create truncated vlog to simulate a partial append. // k0 - single transaction, k1 and k2 in another transaction buf := createVlog(t, []*Entry{ {Key: k0, Value: v0}, {Key: k1, Value: v1}, {Key: k2, Value: v2}, }) buf = buf[:len(buf)-6] require.NoError(t, ioutil.WriteFile(vlogFilePath(dir, 0), buf, 0777)) // Badger should now start up kv, err = Open(opts) require.NoError(t, err) require.NoError(t, kv.View(func(txn *Txn) error { item, err := txn.Get(k0) require.NoError(t, err) require.Equal(t, v0, getItemValue(t, item)) _, err = txn.Get(k1) require.Equal(t, ErrKeyNotFound, err) _, err = txn.Get(k2) require.Equal(t, ErrKeyNotFound, err) return nil })) // When K3 is set, it should be persisted after a restart. txnSet(t, kv, k3, v3, 0) require.NoError(t, kv.Close()) kv, err = Open(opts) require.NoError(t, err) checkKeys(t, kv, [][]byte{k3}) // Replay value log from beginning, badger head is past k2. require.NoError(t, kv.vlog.Close()) // clean up the current db.vhead so that we can replay from the beginning. // If we don't clear the current vhead, badger will error out since new // head passed while opening vlog is zero in the following lines. kv.vhead = valuePointer{} kv.vlog.init(kv) require.NoError( t, kv.vlog.open(kv, valuePointer{Fid: 0}, kv.replayFunction()), ) require.NoError(t, kv.Close()) } func TestReadOnlyOpenWithPartialAppendToValueLog(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) // Create skeleton files. opts := getTestOptions(dir) opts.ValueLogFileSize = 100 * 1024 * 1024 // 100Mb kv, err := Open(opts) require.NoError(t, err) require.NoError(t, kv.Close()) var ( k0 = []byte("k0") k1 = []byte("k1") k2 = []byte("k2") v0 = []byte("value0-012345678901234567890123") v1 = []byte("value1-012345678901234567890123") v2 = []byte("value2-012345678901234567890123") ) // Create truncated vlog to simulate a partial append. 
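// (Each encoded entry ends with a 4-byte CRC32 checksum, so chopping six bytes off the tail guarantees the last entry fails verification and the unfinished transaction is dropped on replay.)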
// k0 - single transaction, k1 and k2 in another transaction buf := createVlog(t, []*Entry{ {Key: k0, Value: v0}, {Key: k1, Value: v1}, {Key: k2, Value: v2}, }) buf = buf[:len(buf)-6] require.NoError(t, ioutil.WriteFile(vlogFilePath(dir, 0), buf, 0777)) opts.ReadOnly = true // Badger should fail a read-only open with values to replay _, err = Open(opts) require.Error(t, err) require.Regexp(t, "Database was not properly closed, cannot open read-only|Read-only mode is not supported on Windows", err.Error()) } func TestValueLogTrigger(t *testing.T) { t.Skip("Difficult to trigger compaction, so skipping. Re-enable after fixing #226") dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueLogFileSize = 1 << 20 kv, err := Open(opt) require.NoError(t, err) // Write a lot of data, so it creates some work for value log GC. sz := 32 << 10 txn := kv.NewTransaction(true) for i := 0; i < 100; i++ { v := make([]byte, sz) rand.Read(v[:rand.Intn(sz)]) require.NoError(t, txn.SetEntry(NewEntry([]byte(fmt.Sprintf("key%d", i)), v))) if i%20 == 0 { require.NoError(t, txn.Commit()) txn = kv.NewTransaction(true) } } require.NoError(t, txn.Commit()) for i := 0; i < 45; i++ { txnDelete(t, kv, []byte(fmt.Sprintf("key%d", i))) } require.NoError(t, kv.RunValueLogGC(0.5)) require.NoError(t, kv.Close()) err = kv.RunValueLogGC(0.5) require.Equal(t, ErrRejected, err, "Error should be returned after closing DB.") } func createVlog(t *testing.T, entries []*Entry) []byte { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opts := getTestOptions(dir) opts.ValueLogFileSize = 100 * 1024 * 1024 // 100Mb kv, err := Open(opts) require.NoError(t, err) txnSet(t, kv, entries[0].Key, entries[0].Value, entries[0].meta) entries = entries[1:] txn := kv.NewTransaction(true) for _, entry := range entries { require.NoError(t, txn.SetEntry(NewEntry(entry.Key, entry.Value).WithMeta(entry.meta))) } require.NoError(t, txn.Commit()) require.NoError(t, kv.Close()) filename := vlogFilePath(dir, 0) buf, err := ioutil.ReadFile(filename) require.NoError(t, err) return buf } func TestPenultimateLogCorruption(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.ValueLogLoadingMode = options.FileIO // Each txn generates at least two entries. 3 txns will fit each file. opt.ValueLogMaxEntries = 5 opt.LogRotatesToFlush = 1000 db0, err := Open(opt) require.NoError(t, err) defer func() { require.NoError(t, db0.Close()) }() h := testHelper{db: db0, t: t} h.writeRange(0, 7) h.readRange(0, 7) for i := 2; i >= 0; i-- { fpath := vlogFilePath(dir, uint32(i)) fi, err := os.Stat(fpath) require.NoError(t, err) require.True(t, fi.Size() > 0, "Empty file at log=%d", i) if i == 0 { err := os.Truncate(fpath, fi.Size()-1) require.NoError(t, err) } } // Simulate a crash by not closing db0, but releasing the locks. if db0.dirLockGuard != nil { require.NoError(t, db0.dirLockGuard.release()) db0.dirLockGuard = nil } if db0.valueDirGuard != nil { require.NoError(t, db0.valueDirGuard.release()) db0.valueDirGuard = nil } opt.Truncate = true db1, err := Open(opt) require.NoError(t, err) h.db = db1 h.readRange(0, 1) // Only 2 should be gone, because it is at the end of logfile 0. h.readRange(3, 7) err = db1.View(func(txn *Txn) error { _, err := txn.Get(h.key(2)) // Verify that 2 is gone.
require.Equal(t, ErrKeyNotFound, err) return nil }) require.NoError(t, err) require.NoError(t, db1.Close()) } func checkKeys(t *testing.T, kv *DB, keys [][]byte) { i := 0 txn := kv.NewTransaction(false) defer txn.Discard() iter := txn.NewIterator(IteratorOptions{}) defer iter.Close() for iter.Seek(keys[0]); iter.Valid(); iter.Next() { require.Equal(t, iter.Item().Key(), keys[i]) i++ } require.Equal(t, i, len(keys)) } type testHelper struct { db *DB t *testing.T val []byte } func (th *testHelper) key(i int) []byte { return []byte(fmt.Sprintf("%010d", i)) } func (th *testHelper) value() []byte { if len(th.val) > 0 { return th.val } th.val = make([]byte, 100) y.Check2(rand.Read(th.val)) return th.val } // writeRange [from, to]. func (th *testHelper) writeRange(from, to int) { for i := from; i <= to; i++ { err := th.db.Update(func(txn *Txn) error { return txn.SetEntry(NewEntry(th.key(i), th.value())) }) require.NoError(th.t, err) } } func (th *testHelper) readRange(from, to int) { for i := from; i <= to; i++ { err := th.db.View(func(txn *Txn) error { item, err := txn.Get(th.key(i)) if err != nil { return err } return item.Value(func(val []byte) error { require.Equal(th.t, val, th.value(), "key=%q", th.key(i)) return nil }) }) require.NoError(th.t, err, "key=%q", th.key(i)) } } // Test Bug #578, which showed that if a value is moved during value log GC, an // older version can end up at a higher level in the LSM tree than a newer // version, causing the data to not be returned. func TestBug578(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") y.Check(err) defer removeDir(dir) db, err := Open(DefaultOptions(dir). WithValueLogMaxEntries(64). WithMaxTableSize(1 << 13)) require.NoError(t, err) h := testHelper{db: db, t: t} // Let's run this whole thing a few times. for j := 0; j < 10; j++ { t.Logf("Cycle: %d\n", j) h.writeRange(0, 32) h.writeRange(0, 10) h.writeRange(50, 72) h.writeRange(40, 72) h.writeRange(40, 72) // Run value log GC a few times. for i := 0; i < 5; i++ { db.RunValueLogGC(0.5) } h.readRange(0, 10) } require.NoError(t, db.Close()) } func BenchmarkReadWrite(b *testing.B) { rwRatio := []float32{ 0.1, 0.2, 0.5, 1.0, } valueSize := []int{ 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, } for _, vsz := range valueSize { for _, rw := range rwRatio { b.Run(fmt.Sprintf("%3.1f,%04d", rw, vsz), func(b *testing.B) { dir, err := ioutil.TempDir("", "vlog-benchmark") y.Check(err) defer removeDir(dir) db, err := Open(getTestOptions(dir)) y.Check(err) vl := &db.vlog b.ResetTimer() for i := 0; i < b.N; i++ { e := new(Entry) e.Key = make([]byte, 16) e.Value = make([]byte, vsz) bl := new(request) bl.Entries = []*Entry{e} var ptrs []valuePointer vl.write([]*request{bl}) ptrs = append(ptrs, bl.Ptrs...) 
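// f decides the next operation: with probability rw we issue another write; otherwise we read back a randomly chosen pointer from those written so far.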
f := rand.Float32() if f < rw { vl.write([]*request{bl}) } else { ln := len(ptrs) if ln == 0 { b.Fatalf("Zero length of ptrs") } idx := rand.Intn(ln) s := new(y.Slice) buf, lf, err := vl.readValueBytes(ptrs[idx], s) if err != nil { b.Fatalf("Benchmark Read: %v", err) } e, err := lf.decodeEntry(buf, ptrs[idx].Offset) require.NoError(b, err) if len(e.Key) != 16 { b.Fatalf("Key is invalid") } if len(e.Value) != vsz { b.Fatalf("Value is invalid") } runCallback(db.vlog.getUnlockCallback(lf)) } } }) } } } // Regression test for https://github.com/dgraph-io/badger/issues/817 func TestValueLogTruncate(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := Open(DefaultOptions(dir).WithTruncate(true)) require.NoError(t, err) // Insert 1 entry so that we have valid data in the first vlog file require.NoError(t, db.Update(func(txn *Txn) error { return txn.Set([]byte("foo"), nil) })) fileCountBeforeCorruption := len(db.vlog.filesMap) require.NoError(t, db.Close()) // Create two vlog files with corrupted data. These will be truncated when the DB starts next time require.NoError(t, ioutil.WriteFile(vlogFilePath(dir, 1), []byte("foo"), 0664)) require.NoError(t, ioutil.WriteFile(vlogFilePath(dir, 2), []byte("foo"), 0664)) db, err = Open(DefaultOptions(dir).WithTruncate(true)) require.NoError(t, err) // Ensure vlog file with id=1 is not present require.Nil(t, db.vlog.filesMap[1]) // Ensure the file for fid=2 was truncated down to just the header zeroFile, ok := db.vlog.filesMap[2] require.True(t, ok) fileStat, err := zeroFile.fd.Stat() require.NoError(t, err) // The size of the last vlog file on Windows is equal to 2*opt.ValueLogFileSize. This is because // we mmap the last value log file and Windows doesn't allow us to mmap a file more than // its actual size. So we increase the file size and then mmap it. See mmap_windows.go file. if runtime.GOOS == "windows" { require.Equal(t, 2*db.opt.ValueLogFileSize, fileStat.Size()) } else { require.Equal(t, int64(vlogHeaderSize), fileStat.Size()) } fileCountAfterCorruption := len(db.vlog.filesMap) // +1 because the file with id=2 will be completely truncated. It won't be deleted. // There would be two files. fid=0 with valid data, fid=2 with zero data (truncated). require.Equal(t, fileCountBeforeCorruption+1, fileCountAfterCorruption) // Max file ID would point to the last vlog file, which is fid=2 in this case require.Equal(t, 2, int(db.vlog.maxFid)) require.NoError(t, db.Close()) } // Regression test for https://github.com/dgraph-io/dgraph/issues/3669 func TestTruncatedDiscardStat(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) ops := getTestOptions(dir) db, err := Open(ops) require.NoError(t, err) stat := make(map[uint32]int64, 20) for i := uint32(0); i < uint32(20); i++ { stat[i] = 0 } db.vlog.lfDiscardStats.m = stat encodedDS, _ := json.Marshal(db.vlog.lfDiscardStats.m) entries := []*Entry{{ Key: y.KeyWithTs(lfDiscardStatsKey, 1), // Insert truncated discard stats. This is important. Value: encodedDS[:13], }} // Push discard stats entry to the write channel. req, err := db.sendToWriteCh(entries) require.NoError(t, err) req.Wait() // Unset discard stats. We've already pushed the stats. If we don't unset it then it will be // pushed again on DB close.
db.vlog.lfDiscardStats.m = nil require.NoError(t, db.Close()) db, err = Open(ops) require.NoError(t, err) require.NoError(t, db.Close()) } func TestSafeEntry(t *testing.T) { var s safeRead s.lf = &logFile{} e := NewEntry([]byte("foo"), []byte("bar")) buf := bytes.NewBuffer(nil) _, err := s.lf.encodeEntry(e, buf, 0) require.NoError(t, err) ne, err := s.Entry(buf) require.NoError(t, err) require.Equal(t, e.Key, ne.Key, "key mismatch") require.Equal(t, e.Value, ne.Value, "value mismatch") require.Equal(t, e.meta, ne.meta, "meta mismatch") require.Equal(t, e.UserMeta, ne.UserMeta, "usermeta mismatch") require.Equal(t, e.ExpiresAt, ne.ExpiresAt, "expiresAt mismatch") } // Regression test for https://github.com/dgraph-io/badger/issues/926 func TestDiscardStatsMove(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) ops := getTestOptions(dir) ops.ValueLogMaxEntries = 1 db, err := Open(ops) require.NoError(t, err) stat := make(map[uint32]int64, ops.ValueThreshold+10) for i := uint32(0); i < uint32(ops.ValueThreshold+10); i++ { stat[i] = 0 } db.vlog.lfDiscardStats.Lock() db.vlog.lfDiscardStats.m = stat encodedDS, _ := json.Marshal(db.vlog.lfDiscardStats.m) db.vlog.lfDiscardStats.Unlock() entries := []*Entry{{ Key: y.KeyWithTs(lfDiscardStatsKey, 1), // The discard stat value is more than value threshold. Value: encodedDS, }} // Push discard stats entry to the write channel. req, err := db.sendToWriteCh(entries) require.NoError(t, err) req.Wait() // Unset discard stats. We've already pushed the stats. If we don't unset it then it will be // pushed again on DB close. Also, the first insertion was in vlog file 1, this insertion would // be in value log file 3. db.vlog.lfDiscardStats.Lock() db.vlog.lfDiscardStats.m = nil db.vlog.lfDiscardStats.Unlock() // Push more entries so that we get more than 1 value log file. require.NoError(t, db.Update(func(txn *Txn) error { e := NewEntry([]byte("f"), []byte("1")) return txn.SetEntry(e) })) require.NoError(t, db.Update(func(txn *Txn) error { e := NewEntry([]byte("ff"), []byte("1")) return txn.SetEntry(e) })) tr := trace.New("Badger.ValueLog", "GC") // Use first value log file for GC. This value log file contains the discard stats. lf := db.vlog.filesMap[0] require.NoError(t, db.vlog.rewrite(lf, tr)) require.NoError(t, db.Close()) db, err = Open(ops) // discardStats will be populated using vlog.populateDiscardStats(), which pushes discard stats // to vlog.lfDiscardStats.flushChan. Hence, wait for some time for the discard stats to be updated. time.Sleep(1 * time.Second) require.NoError(t, err) db.vlog.lfDiscardStats.RLock() require.Equal(t, stat, db.vlog.lfDiscardStats.m) db.vlog.lfDiscardStats.RUnlock() require.NoError(t, db.Close()) } // This test ensures that flushDiscardStats() doesn't crash. func TestBlockedDiscardStats(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer os.Remove(dir) db, err := Open(getTestOptions(dir)) require.NoError(t, err) // Set discard stats. db.vlog.lfDiscardStats.m = map[uint32]int64{0: 0} db.blockWrite() // Push more discard stats than the capacity of flushChan. This ensures at least one flush // operation completes successfully after the writes were blocked.
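// Sending cap(flushChan)+2 items on the blocking channel below can only succeed if the flushDiscardStats goroutine keeps draining it, which is what this test asserts.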
for i := 0; i < cap(db.vlog.lfDiscardStats.flushChan)+2; i++ { db.vlog.lfDiscardStats.flushChan <- db.vlog.lfDiscardStats.m } db.unblockWrite() require.NoError(t, db.Close()) } // Regression test for https://github.com/dgraph-io/badger/issues/970 func TestBlockedDiscardStatsOnClose(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) db, err := Open(getTestOptions(dir)) require.NoError(t, err) db.vlog.lfDiscardStats.m = map[uint32]int64{0: 0} // This is important. Set updateSinceFlush to discardStatsFlushThreshold so // that the next update call flushes the discard stats. db.vlog.lfDiscardStats.updatesSinceFlush = discardStatsFlushThreshold + 1 require.NoError(t, db.Close()) } func TestValueEntryChecksum(t *testing.T) { k := []byte("KEY") v := []byte(fmt.Sprintf("val%100d", 10)) t.Run("ok", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.VerifyValueChecksum = true opt.ValueThreshold = 32 db, err := Open(opt) require.NoError(t, err) require.Greater(t, len(v), db.opt.ValueThreshold) txnSet(t, db, k, v, 0) require.NoError(t, db.Close()) db, err = Open(opt) require.NoError(t, err) txn := db.NewTransaction(false) entry, err := txn.Get(k) require.NoError(t, err) x, err := entry.ValueCopy(nil) require.NoError(t, err) require.Equal(t, v, x) require.NoError(t, db.Close()) }) // Regression test for https://github.com/dgraph-io/badger/issues/1049 t.Run("Corruption", func(t *testing.T) { dir, err := ioutil.TempDir("", "badger-test") require.NoError(t, err) defer removeDir(dir) opt := getTestOptions(dir) opt.VerifyValueChecksum = true opt.ValueThreshold = 32 db, err := Open(opt) require.NoError(t, err) require.Greater(t, len(v), db.opt.ValueThreshold) txnSet(t, db, k, v, 0) path := db.vlog.fpath(0) require.NoError(t, db.Close()) file, err := os.OpenFile(path, os.O_RDWR, 0644) require.NoError(t, err) offset := 50 orig := make([]byte, 1) _, err = file.ReadAt(orig, int64(offset)) require.NoError(t, err) // Corrupt a single bit. _, err = file.WriteAt([]byte{7}, int64(offset)) require.NoError(t, err) require.NoError(t, file.Close()) db, err = Open(opt) require.NoError(t, err) txn := db.NewTransaction(false) entry, err := txn.Get(k) require.NoError(t, err) x, err := entry.ValueCopy(nil) require.Error(t, err) require.Contains(t, err.Error(), "checksum mismatch") require.Nil(t, x) require.NoError(t, db.Close()) }) } func TestValidateWrite(t *testing.T) { // Mocking the file size, so that we don't allocate big memory while running test. maxVlogFileSize = 400 defer func() { maxVlogFileSize = math.MaxUint32 }() bigBuf := make([]byte, maxVlogFileSize+1) log := &valueLog{ opt: DefaultOptions("."), } // Sending a request with big values which will overflow uint32. key := []byte("HelloKey") req := &request{ Entries: []*Entry{ { Key: key, Value: bigBuf, }, { Key: key, Value: bigBuf, }, { Key: key, Value: bigBuf, }, }, } err := log.validateWrites([]*request{req}) require.Error(t, err) // Testing with small values. smallBuf := make([]byte, 4) req1 := &request{ Entries: []*Entry{ { Key: key, Value: smallBuf, }, { Key: key, Value: smallBuf, }, { Key: key, Value: smallBuf, }, }, } err = log.validateWrites([]*request{req1}) require.NoError(t, err) // Batching small and big request. 
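// req1 fits on its own, but once the oversized req is added the running offset estimate crosses maxVlogFileSize, so the combined batch must be rejected.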
err = log.validateWrites([]*request{req1, req}) require.Error(t, err) } badger-2.2007.2/y/000077500000000000000000000000001372173116500134465ustar00rootroot00000000000000badger-2.2007.2/y/checksum.go000066400000000000000000000030231372173116500155750ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "hash/crc32" "github.com/dgraph-io/badger/v2/pb" "github.com/cespare/xxhash" "github.com/pkg/errors" ) // ErrChecksumMismatch is returned at checksum mismatch. var ErrChecksumMismatch = errors.New("checksum mismatch") // CalculateChecksum calculates checksum for data using ct checksum type. func CalculateChecksum(data []byte, ct pb.Checksum_Algorithm) uint64 { switch ct { case pb.Checksum_CRC32C: return uint64(crc32.Checksum(data, CastagnoliCrcTable)) case pb.Checksum_XXHash64: return xxhash.Sum64(data) default: panic("checksum type not supported") } } // VerifyChecksum validates the checksum for the data against the given expected checksum. func VerifyChecksum(data []byte, expected *pb.Checksum) error { actual := CalculateChecksum(data, expected.Algo) if actual != expected.Sum { return Wrapf(ErrChecksumMismatch, "actual: %d, expected: %d", actual, expected.Sum) } return nil } badger-2.2007.2/y/encrypt.go000066400000000000000000000023001372173116500154540ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "crypto/aes" "crypto/cipher" "crypto/rand" ) // XORBlock encrypts the given data with AES and XOR's with IV. // Can be used for both encryption and decryption. IV is of // AES block size. func XORBlock(src, key, iv []byte) ([]byte, error) { block, err := aes.NewCipher(key) if err != nil { return nil, err } stream := cipher.NewCTR(block, iv) dst := make([]byte, len(src)) stream.XORKeyStream(dst, src) return dst, nil } // GenerateIV generates IV. func GenerateIV() ([]byte, error) { iv := make([]byte, aes.BlockSize) _, err := rand.Read(iv) return iv, err } badger-2.2007.2/y/error.go000066400000000000000000000046201372173116500151300ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y // This file contains some functions for error handling. Note that we are moving // towards using x.Trace, i.e., rpc tracing using net/tracer. But for now, these // functions are useful for simple checks logged on one machine. // Some common use cases are: // (1) You receive an error from external lib, and would like to check/log fatal. // For this, use x.Check, x.Checkf. These will check for err != nil, which is // more common in Go. If you want to check for boolean being true, use // x.Assert, x.Assertf. // (2) You receive an error from external lib, and would like to pass on with some // stack trace information. In this case, use x.Wrap or x.Wrapf. // (3) You want to generate a new error with stack trace info. Use x.Errorf. import ( "fmt" "log" "github.com/pkg/errors" ) var debugMode = true // Check logs fatal if err != nil. func Check(err error) { if err != nil { log.Fatalf("%+v", Wrap(err)) } } // Check2 acts as convenience wrapper around Check, using the 2nd argument as error. func Check2(_ interface{}, err error) { Check(err) } // AssertTrue asserts that b is true. Otherwise, it would log fatal. func AssertTrue(b bool) { if !b { log.Fatalf("%+v", errors.Errorf("Assert failed")) } } // AssertTruef is AssertTrue with extra info. func AssertTruef(b bool, format string, args ...interface{}) { if !b { log.Fatalf("%+v", errors.Errorf(format, args...)) } } // Wrap wraps errors from external lib. func Wrap(err error) error { if !debugMode { return err } return errors.Wrap(err, "") } // Wrapf is Wrap with extra info. func Wrapf(err error, format string, args ...interface{}) error { if !debugMode { if err == nil { return nil } return fmt.Errorf(format+" error: %+v", append(args, err)...) } return errors.Wrapf(err, format, args...) } badger-2.2007.2/y/event_log.go000066400000000000000000000016171372173116500157640ustar00rootroot00000000000000/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import "golang.org/x/net/trace" var ( NoEventLog trace.EventLog = nilEventLog{} ) type nilEventLog struct{} func (nel nilEventLog) Printf(format string, a ...interface{}) {} func (nel nilEventLog) Errorf(format string, a ...interface{}) {} func (nel nilEventLog) Finish() {} badger-2.2007.2/y/file_dsync.go000066400000000000000000000013571372173116500161220ustar00rootroot00000000000000// +build !dragonfly,!freebsd,!windows /* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import "golang.org/x/sys/unix" func init() { datasyncFileFlag = unix.O_DSYNC } badger-2.2007.2/y/file_nodsync.go000066400000000000000000000013401372173116500164470ustar00rootroot00000000000000// +build dragonfly freebsd windows /* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import "syscall" func init() { datasyncFileFlag = syscall.O_SYNC } badger-2.2007.2/y/iterator.go000066400000000000000000000045571372173116500156410ustar00rootroot00000000000000/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "bytes" "encoding/binary" ) // ValueStruct represents the value info that can be associated with a key, but also the internal // Meta field. type ValueStruct struct { Meta byte UserMeta byte ExpiresAt uint64 Value []byte Version uint64 // This field is not serialized. Only for internal usage. } func sizeVarint(x uint64) (n int) { for { n++ x >>= 7 if x == 0 { break } } return n } // EncodedSize is the size of the ValueStruct when encoded func (v *ValueStruct) EncodedSize() uint32 { sz := len(v.Value) + 2 // meta, usermeta. if v.ExpiresAt == 0 { return uint32(sz + 1) } enc := sizeVarint(v.ExpiresAt) return uint32(sz + enc) } // Decode uses the length of the slice to infer the length of the Value field. func (v *ValueStruct) Decode(b []byte) { v.Meta = b[0] v.UserMeta = b[1] var sz int v.ExpiresAt, sz = binary.Uvarint(b[2:]) v.Value = b[2+sz:] } // Encode expects a slice of length at least v.EncodedSize(). func (v *ValueStruct) Encode(b []byte) { b[0] = v.Meta b[1] = v.UserMeta sz := binary.PutUvarint(b[2:], v.ExpiresAt) copy(b[2+sz:], v.Value) } // EncodeTo should be kept in sync with the Encode function above. The reason // this function exists is to avoid creating byte arrays per key-value pair in // table/builder.go. 
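// A minimal usage sketch (hypothetical values, shown here for illustration):
//
//	var buf bytes.Buffer
//	vs := ValueStruct{Meta: 1, Value: []byte("x")}
//	vs.EncodeTo(&buf) // buf now holds exactly vs.EncodedSize() bytes.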
func (v *ValueStruct) EncodeTo(buf *bytes.Buffer) { buf.WriteByte(v.Meta) buf.WriteByte(v.UserMeta) var enc [binary.MaxVarintLen64]byte sz := binary.PutUvarint(enc[:], v.ExpiresAt) buf.Write(enc[:sz]) buf.Write(v.Value) } // Iterator is an interface for a basic iterator. type Iterator interface { Next() Rewind() Seek(key []byte) Key() []byte Value() ValueStruct Valid() bool // All iterators should be closed so that file garbage collection works. Close() error }
badger-2.2007.2/y/metrics.go
/* * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import "expvar" var ( // LSMSize has size of the LSM in bytes LSMSize *expvar.Map // VlogSize has size of the value log in bytes VlogSize *expvar.Map // PendingWrites tracks the number of pending writes. PendingWrites *expvar.Map // These are cumulative // NumReads has cumulative number of reads NumReads *expvar.Int // NumWrites has cumulative number of writes NumWrites *expvar.Int // NumBytesRead has cumulative number of bytes read NumBytesRead *expvar.Int // NumBytesWritten has cumulative number of bytes written NumBytesWritten *expvar.Int // NumLSMGets is number of LSM gets NumLSMGets *expvar.Map // NumLSMBloomHits is number of LSM bloom hits NumLSMBloomHits *expvar.Map // NumGets is number of gets NumGets *expvar.Int // NumPuts is number of puts NumPuts *expvar.Int // NumBlockedPuts is number of blocked puts NumBlockedPuts *expvar.Int // NumMemtableGets is number of memtable gets NumMemtableGets *expvar.Int ) // These variables are global and have cumulative values for all kv stores. func init() { NumReads = expvar.NewInt("badger_v2_disk_reads_total") NumWrites = expvar.NewInt("badger_v2_disk_writes_total") NumBytesRead = expvar.NewInt("badger_v2_read_bytes") NumBytesWritten = expvar.NewInt("badger_v2_written_bytes") NumLSMGets = expvar.NewMap("badger_v2_lsm_level_gets_total") NumLSMBloomHits = expvar.NewMap("badger_v2_lsm_bloom_hits_total") NumGets = expvar.NewInt("badger_v2_gets_total") NumPuts = expvar.NewInt("badger_v2_puts_total") NumBlockedPuts = expvar.NewInt("badger_v2_blocked_puts_total") NumMemtableGets = expvar.NewInt("badger_v2_memtable_gets_total") LSMSize = expvar.NewMap("badger_v2_lsm_size_bytes") VlogSize = expvar.NewMap("badger_v2_vlog_size_bytes") PendingWrites = expvar.NewMap("badger_v2_pending_writes_total") }
badger-2.2007.2/y/mmap.go
/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "os" ) // Mmap uses the mmap system call to memory-map a file. If writable is true, // memory protection of the pages is set so that they may be written to as well. func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) { return mmap(fd, writable, size) } // Munmap unmaps a previously mapped slice. func Munmap(b []byte) error { return munmap(b) } // Madvise uses the madvise system call to give advice about the use of memory // when using a slice that is memory-mapped to a file. Set the readahead flag to // false if page references are expected in random order. func Madvise(b []byte, readahead bool) error { return madvise(b, readahead) }
badger-2.2007.2/y/mmap_darwin.go
/* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "os" "syscall" "unsafe" "golang.org/x/sys/unix" ) // Mmap uses the mmap system call to memory-map a file. If writable is true, // memory protection of the pages is set so that they may be written to as well. func mmap(fd *os.File, writable bool, size int64) ([]byte, error) { mtype := unix.PROT_READ if writable { mtype |= unix.PROT_WRITE } return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED) } // Munmap unmaps a previously mapped slice. func munmap(b []byte) error { return unix.Munmap(b) } // This is required because the unix package does not support the madvise system call on OS X. func madvise(b []byte, readahead bool) error { advice := unix.MADV_NORMAL if !readahead { advice = unix.MADV_RANDOM } _, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)), uintptr(advice)) if e1 != 0 { return e1 } return nil }
badger-2.2007.2/y/mmap_unix.go
// +build !windows,!darwin /* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "os" "golang.org/x/sys/unix" ) // Mmap uses the mmap system call to memory-map a file. If writable is true, // memory protection of the pages is set so that they may be written to as well.
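//
// A hypothetical caller-side sketch (the file name and size are made up) of the
// exported Mmap/Madvise/Munmap wrappers that dispatch to this implementation on
// non-Windows, non-Darwin platforms:
//
//	fd, _ := os.Open("/tmp/data.bin")  // assumed to exist and be large enough
//	buf, err := Mmap(fd, false, 1<<20) // read-only, 1 MiB mapping
//	if err == nil {
//		_ = Madvise(buf, false) // page references expected in random order
//		defer Munmap(buf)
//	}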
func mmap(fd *os.File, writable bool, size int64) ([]byte, error) { mtype := unix.PROT_READ if writable { mtype |= unix.PROT_WRITE } return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED) } // Munmap unmaps a previously mapped slice. func munmap(b []byte) error { return unix.Munmap(b) } // Madvise uses the madvise system call to give advice about the use of memory // when using a slice that is memory-mapped to a file. Set the readahead flag to // false if page references are expected in random order. func madvise(b []byte, readahead bool) error { flags := unix.MADV_NORMAL if !readahead { flags = unix.MADV_RANDOM } return unix.Madvise(b, flags) }
badger-2.2007.2/y/mmap_windows.go
// +build windows /* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "fmt" "os" "syscall" "unsafe" ) func mmap(fd *os.File, write bool, size int64) ([]byte, error) { protect := syscall.PAGE_READONLY access := syscall.FILE_MAP_READ if write { protect = syscall.PAGE_READWRITE access = syscall.FILE_MAP_WRITE } fi, err := fd.Stat() if err != nil { return nil, err } // On Windows, we cannot mmap a file beyond its actual size. // So truncate the file to the size of the mmap. if fi.Size() < size { if err := fd.Truncate(size); err != nil { return nil, fmt.Errorf("truncate: %s", err) } } // Open a file mapping handle. Note that CreateFileMapping takes the high 32 bits // of the size before the low 32 bits. sizeHi := uint32(size >> 32) sizeLo := uint32(size) & 0xffffffff handler, err := syscall.CreateFileMapping(syscall.Handle(fd.Fd()), nil, uint32(protect), sizeHi, sizeLo, nil) if err != nil { return nil, os.NewSyscallError("CreateFileMapping", err) } // Create the memory map. addr, err := syscall.MapViewOfFile(handler, uint32(access), 0, 0, uintptr(size)) if addr == 0 { return nil, os.NewSyscallError("MapViewOfFile", err) } // Close mapping handle. if err := syscall.CloseHandle(syscall.Handle(handler)); err != nil { return nil, os.NewSyscallError("CloseHandle", err) } // Slice memory layout // Copied this snippet from golang/sys package var sl = struct { addr uintptr len int cap int }{addr, int(size), int(size)} // Use unsafe to turn sl into a []byte. data := *(*[]byte)(unsafe.Pointer(&sl)) return data, nil } func munmap(b []byte) error { return syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&b[0]))) } func madvise(b []byte, readahead bool) error { // Do nothing. We don’t care about this setting on Windows. return nil }
badger-2.2007.2/y/watermark.go
/* * Copyright 2016-2018 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "container/heap" "context" "sync/atomic" ) type uint64Heap []uint64 func (u uint64Heap) Len() int { return len(u) } func (u uint64Heap) Less(i, j int) bool { return u[i] < u[j] } func (u uint64Heap) Swap(i, j int) { u[i], u[j] = u[j], u[i] } func (u *uint64Heap) Push(x interface{}) { *u = append(*u, x.(uint64)) } func (u *uint64Heap) Pop() interface{} { old := *u n := len(old) x := old[n-1] *u = old[0 : n-1] return x } // mark contains one or more indices, along with a done boolean to indicate the // status of the index: begin or done. It also contains waiters, which could be // waiting for the watermark to reach >= a certain index. type mark struct { // Either this is an (index, waiter) pair or (index, done) or (indices, done). index uint64 waiter chan struct{} indices []uint64 done bool // Set to true if the index is done. } // WaterMark is used to keep track of the minimum unfinished index. Typically, an index k becomes // finished or "done" according to a WaterMark once Done(k) has been called // 1. as many times as Begin(k) has, AND // 2. a positive number of times. // // An index may also become "done" by calling SetDoneUntil at a time such that it is not // inter-mingled with Begin/Done calls. // // Since doneUntil and lastIndex addresses are passed to sync/atomic packages, we ensure that they // are 64-bit aligned by putting them at the beginning of the structure. type WaterMark struct { doneUntil uint64 lastIndex uint64 Name string markCh chan mark } // Init initializes a WaterMark struct. MUST be called before using it. func (w *WaterMark) Init(closer *Closer) { w.markCh = make(chan mark, 100) go w.process(closer) } // Begin sets the last index to the given value. func (w *WaterMark) Begin(index uint64) { atomic.StoreUint64(&w.lastIndex, index) w.markCh <- mark{index: index, done: false} } // BeginMany works like Begin but accepts multiple indices. func (w *WaterMark) BeginMany(indices []uint64) { atomic.StoreUint64(&w.lastIndex, indices[len(indices)-1]) w.markCh <- mark{index: 0, indices: indices, done: false} } // Done sets a single index as done. func (w *WaterMark) Done(index uint64) { w.markCh <- mark{index: index, done: true} } // DoneMany works like Done but accepts multiple indices. func (w *WaterMark) DoneMany(indices []uint64) { w.markCh <- mark{index: 0, indices: indices, done: true} } // DoneUntil returns the maximum index that has the property that all indices // less than or equal to it are done. func (w *WaterMark) DoneUntil() uint64 { return atomic.LoadUint64(&w.doneUntil) } // SetDoneUntil sets the maximum index that has the property that all indices // less than or equal to it are done. func (w *WaterMark) SetDoneUntil(val uint64) { atomic.StoreUint64(&w.doneUntil, val) } // LastIndex returns the last index for which Begin has been called. func (w *WaterMark) LastIndex() uint64 { return atomic.LoadUint64(&w.lastIndex) } // WaitForMark waits until the given index is marked as done.
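//
// A minimal sketch of the Begin/Done/WaitForMark lifecycle (illustrative; the
// name and index values are made up):
//
//	w := &WaterMark{Name: "example"}
//	w.Init(NewCloser(1))
//	w.Begin(10)                // index 10 is now in flight
//	go func() { w.Done(10) }() // finish it elsewhere
//	_ = w.WaitForMark(context.Background(), 10) // returns once DoneUntil() >= 10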
func (w *WaterMark) WaitForMark(ctx context.Context, index uint64) error { if w.DoneUntil() >= index { return nil } waitCh := make(chan struct{}) w.markCh <- mark{index: index, waiter: waitCh} select { case <-ctx.Done(): return ctx.Err() case <-waitCh: return nil } } // process is used to process the Mark channel. This is not thread-safe, // so only run one goroutine for process. One is sufficient, because // all goroutine ops use purely memory and CPU. // Each index has to emit at least one begin watermark in serial order, otherwise waiters // can get blocked indefinitely. Example: suppose we have a watermark at 100 and a waiter at 101; // if no watermark is emitted at index 101, then the waiter gets stuck indefinitely, as it // can't decide whether the task at 101 has decided not to emit a watermark or simply hasn't been // scheduled yet. func (w *WaterMark) process(closer *Closer) { defer closer.Done() var indices uint64Heap // pending maps raft proposal index to the number of pending mutations for this proposal. pending := make(map[uint64]int) waiters := make(map[uint64][]chan struct{}) heap.Init(&indices) processOne := func(index uint64, done bool) { // If not already done, then set. Otherwise, don't undo a done entry. prev, present := pending[index] if !present { heap.Push(&indices, index) } delta := 1 if done { delta = -1 } pending[index] = prev + delta // Update mark by going through all indices in order, checking if they have // been done. Stop at the first index which isn't done. doneUntil := w.DoneUntil() if doneUntil > index { AssertTruef(false, "Name: %s doneUntil: %d. Index: %d", w.Name, doneUntil, index) } until := doneUntil loops := 0 for len(indices) > 0 { min := indices[0] if done := pending[min]; done > 0 { break // len(indices) will be > 0. } // Even if done is called multiple times causing it to become // negative, we should still pop the index. heap.Pop(&indices) delete(pending, min) until = min loops++ } if until != doneUntil { AssertTrue(atomic.CompareAndSwapUint64(&w.doneUntil, doneUntil, until)) } notifyAndRemove := func(idx uint64, toNotify []chan struct{}) { for _, ch := range toNotify { close(ch) } delete(waiters, idx) // Release the memory back. } if until-doneUntil <= uint64(len(waiters)) { // Issue #908 showed that if doneUntil is close to 2^60, while until is zero, this loop // can hog up CPU just iterating over integers creating a busy-wait loop. So, only do // this path if until - doneUntil is less than the number of waiters. for idx := doneUntil + 1; idx <= until; idx++ { if toNotify, ok := waiters[idx]; ok { notifyAndRemove(idx, toNotify) } } } else { for idx, toNotify := range waiters { if idx <= until { notifyAndRemove(idx, toNotify) } } } // end of notifying waiters. } for { select { case <-closer.HasBeenClosed(): return case mark := <-w.markCh: if mark.waiter != nil { doneUntil := atomic.LoadUint64(&w.doneUntil) if doneUntil >= mark.index { close(mark.waiter) } else { ws, ok := waiters[mark.index] if !ok { waiters[mark.index] = []chan struct{}{mark.waiter} } else { waiters[mark.index] = append(ws, mark.waiter) } } } else { if mark.index > 0 { processOne(mark.index, mark.done) } for _, index := range mark.indices { processOne(index, mark.done) } } } } }
badger-2.2007.2/y/y.go
/* * Copyright 2017 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "bytes" "encoding/binary" "fmt" "hash/crc32" "io" "math" "os" "reflect" "sync" "time" "unsafe" "github.com/pkg/errors" ) var ( // ErrEOF indicates an end of file when trying to read from a memory mapped file // and encountering the end of slice. ErrEOF = errors.New("End of mapped region") // ErrZstdCgo indicates that badger was built without cgo but ZSTD // compression algorithm is being used for compression. ZSTD cannot work // without CGO. ErrZstdCgo = errors.New("zstd compression requires building badger with cgo enabled") ) const ( // Sync indicates that O_DSYNC should be set on the underlying file, // ensuring that data writes do not return until the data is flushed // to disk. Sync = 1 << iota // ReadOnly opens the underlying file on a read-only basis. ReadOnly ) var ( // This is O_DSYNC (datasync) on platforms that support it -- see file_dsync.go datasyncFileFlag = 0x0 // CastagnoliCrcTable is a CRC32 polynomial table CastagnoliCrcTable = crc32.MakeTable(crc32.Castagnoli) // Dummy channel for nil closers. dummyCloserChan = make(chan struct{}) ) // OpenExistingFile opens an existing file, errors if it doesn't exist. func OpenExistingFile(filename string, flags uint32) (*os.File, error) { openFlags := os.O_RDWR if flags&ReadOnly != 0 { openFlags = os.O_RDONLY } if flags&Sync != 0 { openFlags |= datasyncFileFlag } return os.OpenFile(filename, openFlags, 0) } // CreateSyncedFile creates a new file (using O_EXCL), errors if it already exists. func CreateSyncedFile(filename string, sync bool) (*os.File, error) { flags := os.O_RDWR | os.O_CREATE | os.O_EXCL if sync { flags |= datasyncFileFlag } return os.OpenFile(filename, flags, 0600) } // OpenSyncedFile creates the file if one doesn't exist. func OpenSyncedFile(filename string, sync bool) (*os.File, error) { flags := os.O_RDWR | os.O_CREATE if sync { flags |= datasyncFileFlag } return os.OpenFile(filename, flags, 0600) } // OpenTruncFile opens the file with O_RDWR | O_CREATE | O_TRUNC func OpenTruncFile(filename string, sync bool) (*os.File, error) { flags := os.O_RDWR | os.O_CREATE | os.O_TRUNC if sync { flags |= datasyncFileFlag } return os.OpenFile(filename, flags, 0600) } // SafeCopy does append(a[:0], src...). func SafeCopy(a, src []byte) []byte { return append(a[:0], src...) } // Copy copies a byte slice and returns the copied slice. func Copy(a []byte) []byte { b := make([]byte, len(a)) copy(b, a) return b } // KeyWithTs generates a new key by appending ts to key. func KeyWithTs(key []byte, ts uint64) []byte { out := make([]byte, len(key)+8) copy(out, key) binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts) return out } // ParseTs parses the timestamp from the key bytes. func ParseTs(key []byte) uint64 { if len(key) <= 8 { return 0 } return math.MaxUint64 - binary.BigEndian.Uint64(key[len(key)-8:]) } // CompareKeys compares the keys without their timestamps first, and compares the // timestamps only if the raw keys are equal. (A plain bytes.Compare over whole keys // would be wrong: with the 8-byte timestamp suffix appended, "a" could sort higher // than "aa".) All keys passed in should have a timestamp.
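//
// A small illustrative sketch (keys and versions made up) of how the timestamp
// suffix interacts with ordering:
//
//	k1 := KeyWithTs([]byte("a"), 2)
//	k2 := KeyWithTs([]byte("a"), 1)
//	k3 := KeyWithTs([]byte("aa"), 1)
//	_ = CompareKeys(k1, k3) // < 0: "a" sorts before "aa", whatever the versions
//	_ = CompareKeys(k1, k2) // < 0: same key, so the higher version sorts first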
func CompareKeys(key1, key2 []byte) int { if cmp := bytes.Compare(key1[:len(key1)-8], key2[:len(key2)-8]); cmp != 0 { return cmp } return bytes.Compare(key1[len(key1)-8:], key2[len(key2)-8:]) } // ParseKey parses the actual key from the key bytes. func ParseKey(key []byte) []byte { if key == nil { return nil } return key[:len(key)-8] } // SameKey checks for key equality ignoring the version timestamp suffix. func SameKey(src, dst []byte) bool { if len(src) != len(dst) { return false } return bytes.Equal(ParseKey(src), ParseKey(dst)) } // Slice holds a reusable buf; it will reallocate if you request a larger size than ever before. // One problem: with n distinct sizes requested in random order, it'll reallocate log(n) times. type Slice struct { buf []byte } // Resize reuses the Slice's buffer (or makes a new one) and returns a slice in that buffer of // length sz. func (s *Slice) Resize(sz int) []byte { if cap(s.buf) < sz { s.buf = make([]byte, sz) } return s.buf[0:sz] } // FixedDuration returns a string representation of the given duration with the // hours, minutes, and seconds. func FixedDuration(d time.Duration) string { str := fmt.Sprintf("%02ds", int(d.Seconds())%60) if d >= time.Minute { str = fmt.Sprintf("%02dm", int(d.Minutes())%60) + str } if d >= time.Hour { str = fmt.Sprintf("%02dh", int(d.Hours())) + str } return str } // Closer holds the two things we need to close a goroutine and wait for it to finish: a chan // to tell the goroutine to shut down, and a WaitGroup with which to wait for it to finish shutting // down. type Closer struct { closed chan struct{} waiting sync.WaitGroup closeOnce sync.Once } // NewCloser constructs a new Closer, with an initial count on the WaitGroup. func NewCloser(initial int) *Closer { ret := &Closer{closed: make(chan struct{})} ret.waiting.Add(initial) return ret } // AddRunning Add()'s delta to the WaitGroup. func (lc *Closer) AddRunning(delta int) { lc.waiting.Add(delta) } // Signal signals the HasBeenClosed signal. func (lc *Closer) Signal() { // Todo(ibrahim): Change Signal to return error on next badger breaking change. lc.closeOnce.Do(func() { close(lc.closed) }) } // HasBeenClosed gets signaled when Signal() is called. func (lc *Closer) HasBeenClosed() <-chan struct{} { if lc == nil { return dummyCloserChan } return lc.closed } // Done calls Done() on the WaitGroup. func (lc *Closer) Done() { if lc == nil { return } lc.waiting.Done() } // Wait waits on the WaitGroup. (It waits for NewCloser's initial value, AddRunning, and Done // calls to balance out.) func (lc *Closer) Wait() { lc.waiting.Wait() } // SignalAndWait calls Signal(), then Wait(). func (lc *Closer) SignalAndWait() { lc.Signal() lc.Wait() } // Throttle allows a limited number of workers to run at a time. It also // provides a mechanism to check for errors encountered by workers and wait for // them to finish. type Throttle struct { once sync.Once wg sync.WaitGroup ch chan struct{} errCh chan error finishErr error } // NewThrottle creates a new throttle with a max number of workers. func NewThrottle(max int) *Throttle { return &Throttle{ ch: make(chan struct{}, max), errCh: make(chan error, max), } } // Do should be called by workers before they start working. It blocks if the // maximum number of workers are already working. If it detects an error from // previously Done workers, it returns that error.
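//
// A hypothetical usage sketch (the worker count and the doWork helper are made
// up for illustration):
//
//	th := NewThrottle(8)
//	for i := 0; i < 64; i++ {
//		if err := th.Do(); err != nil {
//			break // a previously Done worker reported an error
//		}
//		go func() { th.Done(doWork()) }()
//	}
//	err := th.Finish() // waits for all workers; returns the first error, if any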
func (t *Throttle) Do() error { for { select { case t.ch <- struct{}{}: t.wg.Add(1) return nil case err := <-t.errCh: if err != nil { return err } } } } // Done should be called by workers when they finish working. They can also // pass the error status of work done. func (t *Throttle) Done(err error) { if err != nil { t.errCh <- err } select { case <-t.ch: default: panic("Throttle Do Done mismatch") } t.wg.Done() } // Finish waits until all workers have finished working. It returns any error passed by Done. // If Finish is called multiple times, it waits for the workers only once (on the first call). // Subsequent calls return the same error found on the first call. func (t *Throttle) Finish() error { t.once.Do(func() { t.wg.Wait() close(t.ch) close(t.errCh) for err := range t.errCh { if err != nil { t.finishErr = err return } } }) return t.finishErr } // U32ToBytes converts the given uint32 to bytes func U32ToBytes(v uint32) []byte { var uBuf [4]byte binary.BigEndian.PutUint32(uBuf[:], v) return uBuf[:] } // BytesToU32 converts the given byte slice to uint32 func BytesToU32(b []byte) uint32 { return binary.BigEndian.Uint32(b) } // U32SliceToBytes converts the given uint32 slice to a byte slice func U32SliceToBytes(u32s []uint32) []byte { if len(u32s) == 0 { return nil } var b []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b)) hdr.Len = len(u32s) * 4 hdr.Cap = hdr.Len hdr.Data = uintptr(unsafe.Pointer(&u32s[0])) return b } // BytesToU32Slice converts the given byte slice to a uint32 slice func BytesToU32Slice(b []byte) []uint32 { if len(b) == 0 { return nil } var u32s []uint32 hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u32s)) hdr.Len = len(b) / 4 hdr.Cap = hdr.Len hdr.Data = uintptr(unsafe.Pointer(&b[0])) return u32s } // page struct contains one underlying buffer. type page struct { buf []byte } // PageBuffer consists of many pages. A page is a wrapper over []byte. PageBuffer can act as a // replacement of bytes.Buffer. Instead of having a single underlying buffer, it has multiple // underlying buffers. Hence it avoids any copy during relocation (as happens in bytes.Buffer). // PageBuffer allocates memory in pages. Once a page is full, it allocates a page with double the // size of the previous page. Its functions are not thread-safe. type PageBuffer struct { pages []*page length int // Length of PageBuffer. nextPageSize int // Size of next page to be allocated. } // NewPageBuffer returns a new PageBuffer with the first page having size pageSize. func NewPageBuffer(pageSize int) *PageBuffer { b := &PageBuffer{} b.pages = append(b.pages, &page{buf: make([]byte, 0, pageSize)}) b.nextPageSize = pageSize * 2 return b } // Write writes data to PageBuffer b. It returns number of bytes written and any error encountered. func (b *PageBuffer) Write(data []byte) (int, error) { dataLen := len(data) for { cp := b.pages[len(b.pages)-1] // Current page. n := copy(cp.buf[len(cp.buf):cap(cp.buf)], data) cp.buf = cp.buf[:len(cp.buf)+n] b.length += n if len(data) == n { break } data = data[n:] b.pages = append(b.pages, &page{buf: make([]byte, 0, b.nextPageSize)}) b.nextPageSize *= 2 } return dataLen, nil } // WriteByte writes data byte to PageBuffer and returns any encountered error. func (b *PageBuffer) WriteByte(data byte) error { _, err := b.Write([]byte{data}) return err } // Len returns length of PageBuffer. func (b *PageBuffer) Len() int { return b.length } // pageForOffset returns pageIdx and startIdx for the offset.
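//
// A short illustrative sketch of the PageBuffer API described above (sizes are
// arbitrary); pageForOffset below is the internal helper used by Truncate and
// NewReaderAt:
//
//	pb := NewPageBuffer(4096) // first page 4 KiB; later pages double in size
//	_, _ = pb.Write([]byte("hello"))
//	_ = pb.WriteByte('!')
//	tmp := make([]byte, pb.Len())
//	_, _ = pb.NewReaderAt(0).Read(tmp) // tmp now holds "hello!"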
func (b *PageBuffer) pageForOffset(offset int) (int, int) { AssertTrue(offset < b.length) var pageIdx, startIdx, sizeNow int for i := 0; i < len(b.pages); i++ { cp := b.pages[i] if sizeNow+len(cp.buf)-1 < offset { sizeNow += len(cp.buf) } else { pageIdx = i startIdx = offset - sizeNow break } } return pageIdx, startIdx } // Truncate truncates PageBuffer to length n. func (b *PageBuffer) Truncate(n int) { pageIdx, startIdx := b.pageForOffset(n) // For simplicity of the code we discard the extra pages. These pages could be kept and reused. b.pages = b.pages[:pageIdx+1] cp := b.pages[len(b.pages)-1] cp.buf = cp.buf[:startIdx] b.length = n } // Bytes returns the whole buffer data as a single []byte. func (b *PageBuffer) Bytes() []byte { buf := make([]byte, b.length) written := 0 for i := 0; i < len(b.pages); i++ { written += copy(buf[written:], b.pages[i].buf) } return buf } // WriteTo writes the whole buffer to w. It returns the number of bytes written and any error // encountered. func (b *PageBuffer) WriteTo(w io.Writer) (int64, error) { written := int64(0) for i := 0; i < len(b.pages); i++ { n, err := w.Write(b.pages[i].buf) written += int64(n) if err != nil { return written, err } } return written, nil } // NewReaderAt returns a reader which starts reading from offset in page buffer. func (b *PageBuffer) NewReaderAt(offset int) *PageBufferReader { pageIdx, startIdx := b.pageForOffset(offset) return &PageBufferReader{ buf: b, pageIdx: pageIdx, startIdx: startIdx, } } // PageBufferReader is a reader for PageBuffer. type PageBufferReader struct { buf *PageBuffer // Underlying page buffer. pageIdx int // Idx of page from where it will start reading. startIdx int // Idx inside page - buf.pages[pageIdx] from where it will start reading. } // Read reads up to len(p) bytes. It returns the number of bytes read and any error encountered. func (r *PageBufferReader) Read(p []byte) (int, error) { // Check if there is enough to Read. pc := len(r.buf.pages) read := 0 for r.pageIdx < pc && read < len(p) { cp := r.buf.pages[r.pageIdx] // Current Page. endIdx := len(cp.buf) // Last Idx up to which we can read from this page. n := copy(p[read:], cp.buf[r.startIdx:endIdx]) read += n r.startIdx += n // Instead of len(cp.buf), we compare with cap(cp.buf). This ensures that we move to the // next page only when we have read all its data. Reading from the last page is an edge // case. We don't want to move to the next page until the last page is full to its capacity. if r.startIdx >= cap(cp.buf) { // We should move to next page. r.pageIdx++ r.startIdx = 0 continue } // When the last page is not full to its capacity and we have read all data up to its // length, just break out of the loop. if r.pageIdx == pc-1 { break } } if read == 0 { return read, io.EOF } return read, nil }
badger-2.2007.2/y/y_test.go
package y import ( "bytes" "fmt" "io" "math/rand" "testing" "time" "github.com/stretchr/testify/require" ) func BenchmarkBuffer(b *testing.B) { var btw [1024]byte rand.Read(btw[:]) pageSize := 1024 b.Run("bytes-buffer", func(b *testing.B) { buf := new(bytes.Buffer) buf.Grow(pageSize) for i := 0; i < b.N; i++ { buf.Write(btw[:]) } }) b.Run("page-buffer", func(b *testing.B) { b.Run(fmt.Sprintf("page-size-%d", pageSize), func(b *testing.B) { pageBuffer := NewPageBuffer(pageSize) for i := 0; i < b.N; i++ { pageBuffer.Write(btw[:]) } }) }) } func TestPageBuffer(t *testing.T) { rand.Seed(time.Now().Unix()) var bytesBuffer bytes.Buffer // This is just for verifying the result.
bytesBuffer.Grow(512) pageBuffer := NewPageBuffer(512) // Write small []byte. var smallBytes [256]byte rand.Read(smallBytes[:]) var bigBytes [1024]byte rand.Read(bigBytes[:]) _, err := pageBuffer.Write(smallBytes[:]) require.NoError(t, err, "unable to write data to page buffer") _, err = pageBuffer.Write(bigBytes[:]) require.NoError(t, err, "unable to write data to page buffer") // Write data to bytesBuffer also, just to match result. bytesBuffer.Write(smallBytes[:]) bytesBuffer.Write(bigBytes[:]) require.True(t, bytes.Equal(pageBuffer.Bytes(), bytesBuffer.Bytes())) } func TestBufferWrite(t *testing.T) { rand.Seed(time.Now().Unix()) var wb [128]byte rand.Read(wb[:]) pb := NewPageBuffer(32) bb := new(bytes.Buffer) end := 32 for i := 0; i < 3; i++ { n, err := pb.Write(wb[:end]) require.NoError(t, err, "unable to write bytes to buffer") require.Equal(t, n, end, "length of buffer and length written should be equal") // append to bb also for testing. bb.Write(wb[:end]) require.True(t, bytes.Equal(pb.Bytes(), bb.Bytes()), "Both bytes should match") end = end * 2 } } func TestPagebufferTruncate(t *testing.T) { rand.Seed(time.Now().Unix()) var wb [1024]byte rand.Read(wb[:]) b := NewPageBuffer(32) n, err := b.Write(wb[:]) require.Equal(t, n, len(wb), "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") require.True(t, bytes.Equal(wb[:], b.Bytes()), "bytes written and read should be equal") // Truncate to 512. b.Truncate(512) require.True(t, bytes.Equal(b.Bytes(), wb[:512])) // Again write wb. n, err = b.Write(wb[:]) require.Equal(t, n, len(wb), "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") // Truncate to 1000. b.Truncate(1000) require.True(t, bytes.Equal(b.Bytes(), append(wb[:512], wb[:]...)[:1000])) } // Test PageBufferReader using large buffers. func TestPagebufferReader(t *testing.T) { rand.Seed(time.Now().Unix()) var wb [1024]byte rand.Read(wb[:]) b := NewPageBuffer(32) n, err := b.Write(wb[:]) require.Equal(t, n, len(wb), "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") // Also append some bytes so that the last page is not full. n, err = b.Write(wb[:10]) require.Equal(t, n, 10, "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") reader := b.NewReaderAt(0) // Read first 512 bytes. var rb [512]byte n, err = reader.Read(rb[:]) require.NoError(t, err, "unable to read") require.True(t, n == len(rb), "length read should be equal") // Check whether the read bytes are correct. rb2 := b.Bytes()[:512] require.True(t, bytes.Equal(rb[:], rb2)) // Next read using reader. n, err = reader.Read(rb[:]) require.NoError(t, err, "unable to read") require.True(t, n == len(rb), "length read should be equal") // Read same number of bytes using Bytes method. rb2 = b.Bytes()[512:1024] require.True(t, bytes.Equal(rb[:], rb2)) // Next read using reader for reading the last 10 bytes. n, err = reader.Read(rb[:10]) require.NoError(t, err, "unable to read") require.True(t, n == 10, "length read should be equal") // Read same number of bytes using Bytes method. rb2 = b.Bytes()[1024 : 1024+10] require.True(t, bytes.Equal(rb[:10], rb2)) // Check if EOF is returned at the end or not.
n, err = reader.Read(rb[:10]) require.Equal(t, err, io.EOF, "EOF should be returned at end") require.Zero(t, n, "read length should be 0") } // Test PageBuffer by reading at random offset, random length. func TestPagebufferReader2(t *testing.T) { rand.Seed(time.Now().Unix()) var wb [1024]byte rand.Read(wb[:]) b := NewPageBuffer(32) n, err := b.Write(wb[:]) require.Equal(t, n, len(wb), "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") // Also append some bytes so that the last page is not full. n, err = b.Write(wb[:10]) require.Equal(t, n, 10, "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") randOffset := int(rand.Int31n(int32(b.length) - 1)) randLength := int(rand.Int31n(int32(b.length - randOffset))) reader := b.NewReaderAt(randOffset) // Read randLength bytes. rb := make([]byte, randLength) n, err = reader.Read(rb[:]) require.NoError(t, err, "unable to read") require.True(t, n == len(rb), "length read should be equal") // Read same number of bytes using Bytes method. rb2 := b.Bytes()[randOffset : randOffset+randLength] require.True(t, bytes.Equal(rb[:], rb2)) } // Test PageBuffer while reading multiple chunks. Chunks are smaller than pages of PageBuffer. func TestPagebufferReader3(t *testing.T) { rand.Seed(time.Now().Unix()) var wb [1000]byte rand.Read(wb[:]) b := NewPageBuffer(32) n, err := b.Write(wb[:]) require.Equal(t, n, len(wb), "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") reader := b.NewReaderAt(0) chunk := 10 // Read 10 bytes in a loop. readBuf := make([]byte, chunk) currentOffset := 0 for i := 0; i < len(wb)/chunk; i++ { n, err = reader.Read(readBuf) require.NoError(t, err, "unable to read from reader") require.Equal(t, chunk, n, "length read should be equal to chunk") require.True(t, bytes.Equal(readBuf, wb[currentOffset:currentOffset+chunk])) rb := b.Bytes()[currentOffset : currentOffset+chunk] require.True(t, bytes.Equal(wb[currentOffset:currentOffset+chunk], rb)) currentOffset += chunk } // Read EOF. n, err = reader.Read(readBuf) require.Equal(t, err, io.EOF, "should return EOF") require.Equal(t, n, 0) // Read EOF again. n, err = reader.Read(readBuf) require.Equal(t, err, io.EOF, "should return EOF") require.Equal(t, n, 0) } // Test when read buffer is larger than PageBuffer. func TestPagebufferReader4(t *testing.T) { rand.Seed(time.Now().Unix()) var wb [20]byte rand.Read(wb[:]) b := NewPageBuffer(32) n, err := b.Write(wb[:]) require.Equal(t, n, len(wb), "length of buffer and length written should be equal") require.NoError(t, err, "unable to write bytes to buffer") reader := b.NewReaderAt(0) readBuf := make([]byte, 100) n, err = reader.Read(readBuf) require.NoError(t, err, "unable to read from reader") require.Equal(t, 20, n, "length read should be equal to the buffer length") // Read EOF. n, err = reader.Read(readBuf) require.Equal(t, err, io.EOF, "should return EOF") require.Equal(t, n, 0) } func TestMultipleSignals(t *testing.T) { closer := NewCloser(0) require.NotPanics(t, func() { closer.Signal() }) // Should not panic. require.NotPanics(t, func() { closer.Signal() }) require.NotPanics(t, func() { closer.SignalAndWait() }) // Attempt 2. closer = NewCloser(1) require.NotPanics(t, func() { closer.Done() }) require.NotPanics(t, func() { closer.SignalAndWait() }) // Should not panic.
require.NotPanics(t, func() { closer.SignalAndWait() }) require.NotPanics(t, func() { closer.Signal() }) }
badger-2.2007.2/y/zstd_cgo.go
// +build cgo /* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y import ( "github.com/DataDog/zstd" ) // CgoEnabled is used to check if CGO is enabled while building badger. const CgoEnabled = true // ZSTDDecompress decompresses a block using ZSTD algorithm. func ZSTDDecompress(dst, src []byte) ([]byte, error) { return zstd.Decompress(dst, src) } // ZSTDCompress compresses a block using ZSTD algorithm. func ZSTDCompress(dst, src []byte, compressionLevel int) ([]byte, error) { return zstd.CompressLevel(dst, src, compressionLevel) }
badger-2.2007.2/y/zstd_nocgo.go
// +build !cgo /* * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package y // CgoEnabled is used to check if CGO is enabled while building badger. const CgoEnabled = false // ZSTDDecompress decompresses a block using ZSTD algorithm. func ZSTDDecompress(dst, src []byte) ([]byte, error) { return nil, ErrZstdCgo } // ZSTDCompress compresses a block using ZSTD algorithm. func ZSTDCompress(dst, src []byte, compressionLevel int) ([]byte, error) { return nil, ErrZstdCgo }
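// A hypothetical caller-side sketch for the zstd_cgo.go / zstd_nocgo.go pair
// above (illustrative only; data and the compression level are made up):
// callers can branch on CgoEnabled, or simply handle ErrZstdCgo, before relying
// on ZSTD compression.
//
//	if CgoEnabled {
//		out, err := ZSTDCompress(nil, data, 3)
//		_, _ = out, err
//	} // under !cgo, ZSTDCompress would have returned ErrZstdCgo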